dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
mlebench/competitions/petfinder-pawpularity-score/prepare.py
@@ -0,0 +1,76 @@
+import shutil
+from pathlib import Path
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path):
+
+    old_train = read_csv(raw / "train.csv")
+
+    np_rng = np.random.default_rng(0)
+
+    # Original ratio: 6800/(9912 + 6800) = ~ 0.41 test_size
+    # We use 0.1 ratio to avoid taking out too many samples from train
+    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+    new_test_without_labels = new_test.drop(columns=["Pawpularity"])
+
+    # random floats between 1 and 100 inclusive, with 2 decimal places
+    sample_submission = new_test[["Id", "Pawpularity"]].copy()
+    sample_submission["Pawpularity"] = np_rng.uniform(1, 100, len(sample_submission)).round(2)
+
+    new_train.to_csv(public / "train.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+    new_test_without_labels.to_csv(public / "test.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+    (public / "train").mkdir(exist_ok=True)
+    for img_id in tqdm(new_train["Id"], desc="Copying train images", total=len(new_train)):
+        shutil.copy(raw / "train" / f"{img_id}.jpg", public / "train" / f"{img_id}.jpg")
+
+    (public / "test").mkdir(exist_ok=True)
+    for img_id in tqdm(
+        new_test_without_labels["Id"],
+        desc="Copying test images",
+        total=len(new_test_without_labels),
+    ):
+        shutil.copy(raw / "train" / f"{img_id}.jpg", public / "test" / f"{img_id}.jpg")
+
+    # checks
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), "Train and test length should sum to the original train length"
+    assert len(sample_submission) == len(
+        new_test
+    ), "Sample submission should have the same length as the test set"
+
+    assert (
+        new_train.columns.tolist() == old_train.columns.tolist()
+    ), "Old and new train columns should match"
+    assert (
+        new_test_without_labels.columns.tolist() == new_train.columns.tolist()[:-1]
+    ), "Public test columns should match train columns, minus the target column"
+    assert (
+        new_test.columns.tolist() == new_train.columns.tolist()
+    ), "Private test columns should match train columns"
+    assert sample_submission.columns.tolist() == [
+        "Id",
+        "Pawpularity",
+    ], "Sample submission columns should be Id, Pawpularity"
+
+    assert set(new_train["Id"]).isdisjoint(
+        set(new_test["Id"])
+    ), "Train and test ids should not overlap"
+
+    # check copy was successful
+    assert len(list((public / "train").glob("*.jpg"))) == len(
+        new_train
+    ), "Train images should match the train set"
+    assert len(list((public / "test").glob("*.jpg"))) == len(
+        new_test
+    ), "Test images should match the test set"
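All of the `prepare` modules added in this release share the same `prepare(raw, public, private)` entry point. The sketch below is not part of the package; it is a minimal, hypothetical driver showing how the file above could be loaded and run locally. The data paths are invented, and the module is loaded by file path because the competition directory names contain hyphens and are not importable as regular Python packages.

```python
import importlib.util
from pathlib import Path

# Hypothetical locations -- adjust to where the wheel is installed and where the
# raw Kaggle data was downloaded. None of these paths come from the package.
module_path = Path("mlebench/competitions/petfinder-pawpularity-score/prepare.py")
raw = Path("data/petfinder-pawpularity-score/raw")
public = Path("data/petfinder-pawpularity-score/public")
private = Path("data/petfinder-pawpularity-score/private")

# Load prepare.py by path (the directory name is not a valid Python identifier).
spec = importlib.util.spec_from_file_location("petfinder_prepare", module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

public.mkdir(parents=True, exist_ok=True)
private.mkdir(parents=True, exist_ok=True)
module.prepare(raw, public, private)  # 90/10 split, CSVs written, images copied
```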
mlebench/competitions/petfinder-pawpularity-score/prepare_val.py
@@ -0,0 +1,154 @@
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import read_csv
+
+
+def _process_split(
+    source_df: pd.DataFrame,
+    public_dir: Path,
+    private_dir: Path,
+    raw_images_dir: Path,
+    test_size: float,
+    random_state: int,
+):
+    """
+    Processes a single data split, creating train/test sets and associated files.
+
+    This helper function encapsulates the logic for:
+    1. Splitting a dataframe into train and test sets.
+    2. Creating public and private directories.
+    3. Saving train.csv, test.csv (public), test.csv (private), and sample_submission.csv.
+    4. Copying the corresponding images.
+    5. Running assertions to verify the split.
+
+    Args:
+        source_df (pd.DataFrame): The dataframe to be split.
+        public_dir (Path): The public output directory.
+        private_dir (Path): The private output directory.
+        raw_images_dir (Path): The directory containing the source raw images.
+        test_size (float): The proportion of the dataset to allocate to the test split.
+        random_state (int): The random state for reproducibility.
+
+    Returns:
+        pd.DataFrame: The train portion of the split dataframe.
+    """
+    # Create output directories
+    public_dir.mkdir(exist_ok=True, parents=True)
+    private_dir.mkdir(exist_ok=True, parents=True)
+
+    # Perform the split
+    train_df, test_df = train_test_split(
+        source_df, test_size=test_size, random_state=random_state
+    )
+
+    test_df_without_labels = test_df.drop(columns=["Pawpularity"])
+
+    # Create a sample submission file
+    np_rng = np.random.default_rng(random_state)
+    sample_submission = test_df[["Id", "Pawpularity"]].copy()
+    sample_submission["Pawpularity"] = np_rng.uniform(1, 100, len(sample_submission)).round(2)
+
+    # Save CSV files
+    train_df.to_csv(public_dir / "train.csv", index=False)
+    test_df.to_csv(private_dir / "test.csv", index=False)
+    test_df_without_labels.to_csv(public_dir / "test.csv", index=False)
+    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+    # Copy train images
+    (public_dir / "train").mkdir(exist_ok=True)
+    for img_id in tqdm(
+        train_df["Id"], desc=f"Copying train images to {public_dir.name}", total=len(train_df)
+    ):
+        shutil.copy(raw_images_dir / f"{img_id}.jpg", public_dir / "train" / f"{img_id}.jpg")
+
+    # Copy test images
+    (public_dir / "test").mkdir(exist_ok=True)
+    for img_id in tqdm(
+        test_df_without_labels["Id"],
+        desc=f"Copying test images to {public_dir.name}",
+        total=len(test_df_without_labels),
+    ):
+        shutil.copy(raw_images_dir / f"{img_id}.jpg", public_dir / "test" / f"{img_id}.jpg")
+
+    # checks
+    assert len(train_df) + len(test_df) == len(
+        source_df
+    ), "Train and test length should sum to the source df length"
+    assert len(sample_submission) == len(
+        test_df
+    ), "Sample submission should have the same length as the test set"
+    assert (
+        train_df.columns.tolist() == source_df.columns.tolist()
+    ), "Train columns should match source columns"
+    assert (
+        test_df_without_labels.columns.tolist() == train_df.columns.tolist()[:-1]
+    ), "Public test columns should match train columns, minus the target column"
+    assert (
+        test_df.columns.tolist() == train_df.columns.tolist()
+    ), "Private test columns should match train columns"
+    assert sample_submission.columns.tolist() == [
+        "Id",
+        "Pawpularity",
+    ], "Sample submission columns should be Id, Pawpularity"
+    assert set(train_df["Id"]).isdisjoint(
+        set(test_df["Id"])
+    ), "Train and test ids should not overlap"
+    assert len(list((public_dir / "train").glob("*.jpg"))) == len(
+        train_df
+    ), "Train images should match the train set"
+    assert len(list((public_dir / "test").glob("*.jpg"))) == len(
+        test_df
+    ), "Test images should match the test set"
+
+    return train_df
+
+
+def prepare(raw: Path, public: Path, private: Path):
+
+    old_train = read_csv(raw / "train.csv")
+    raw_images_dir = raw / "train"
+
+    # --- First Split: Create the original train/test sets ---
+    # This split creates the main `public` and `private` directories.
+    # Its outputs must remain identical to the original script.
+    # Original ratio: 6800/(9912 + 6800) = ~ 0.41 test_size
+    # We use 0.1 ratio to avoid taking out too many samples from train
+    original_test_size = 0.1
+    train_from_first_split = _process_split(
+        source_df=old_train,
+        public_dir=public,
+        private_dir=private,
+        raw_images_dir=raw_images_dir,
+        test_size=original_test_size,
+        random_state=0,
+    )
+
+    # --- Second Split: Create the new validation sets from the first split's train set ---
+    # Define new paths for the validation set outputs.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # Calculate the test size for the second split to make the new test_val set
+    # have approximately the same size as the original test set.
+    # size(test_val) = size(test_original)
+    # test_size_val * size(train_from_first_split) = original_test_size * size(old_train)
+    # test_size_val * (1 - original_test_size) * size(old_train) = original_test_size * size(old_train)
+    # test_size_val = original_test_size / (1 - original_test_size)
+    val_test_size = original_test_size / (1 - original_test_size)
+
+    # This split creates `public_val` and `private_val` directories.
+    # The random_state is kept the same for consistency.
+    _process_split(
+        source_df=train_from_first_split,
+        public_dir=public_val,
+        private_dir=private_val,
+        raw_images_dir=raw_images_dir,
+        test_size=val_test_size,
+        random_state=0,
+    )
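The `val_test_size` formula above is meant to keep the validation hold-out roughly the same absolute size as the original hold-out. A quick numeric check using the 9912 + 6800 row counts quoted in the file's own comment (illustrative arithmetic only, not package code):

```python
total = 9912 + 6800                      # row counts quoted in the comment above
original_test_size = 0.1

first_test = total * original_test_size                        # ~1671 rows held out
remaining_train = total - first_test                           # ~15041 rows remain

val_test_size = original_test_size / (1 - original_test_size)  # 1/9 ~ 0.111
second_test = remaining_train * val_test_size                  # ~1671 rows again

print(round(first_test), round(second_test))  # both hold-outs contain ~1671 rows
```

So the second split trades about a tenth of the remaining training rows for a validation test set of roughly the same size as the original one.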
mlebench/competitions/plant-pathology-2020-fgvc7/grade.py
@@ -0,0 +1,41 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics import roc_auc_score
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    expected_answers_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]
+    expected_sample_submission_columns = expected_answers_columns
+
+    assert set(answers.columns) == set(
+        expected_answers_columns
+    ), f"Expected `answers` to have columns {expected_answers_columns} but got {answers.columns}"
+
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Expected submission to have {len(answers)} rows but got {len(submission)}"
+        )
+
+    if set(submission.columns) != set(expected_sample_submission_columns):
+        raise InvalidSubmissionError(
+            f"Expected submission to have columns {expected_sample_submission_columns} but got {submission.columns}"
+        )
+
+    submission_sorted = submission.sort_values(by="image_id").reset_index(drop=True)
+    answers_sorted = answers.sort_values(by="image_id").reset_index(drop=True)
+
+    if not all(submission_sorted["image_id"] == answers_sorted["image_id"]):
+        raise InvalidSubmissionError(
+            "Submission and answers DataFrames must have matching 'image_id' columns."
+        )
+
+    class_columns = ["healthy", "multiple_diseases", "rust", "scab"]
+    scores = []
+
+    for cls in class_columns:
+        score = roc_auc_score(y_true=answers_sorted[cls], y_score=submission_sorted[cls])
+        scores.append(score)
+
+    return np.mean(scores)
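This grader scores a submission by averaging one-vs-rest ROC AUC over the four class columns. The toy computation below (invented four-row data, for illustration only) mirrors the core loop of `grade` above while skipping its validation checks:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Invented example: one image per class.
answers = pd.DataFrame(
    {
        "image_id": ["Test_0", "Test_1", "Test_2", "Test_3"],
        "healthy": [1, 0, 0, 0],
        "multiple_diseases": [0, 1, 0, 0],
        "rust": [0, 0, 1, 0],
        "scab": [0, 0, 0, 1],
    }
)
# A submission that ranks the true class highest in every row.
submission = pd.DataFrame(
    {
        "image_id": ["Test_0", "Test_1", "Test_2", "Test_3"],
        "healthy": [0.7, 0.1, 0.1, 0.1],
        "multiple_diseases": [0.1, 0.7, 0.1, 0.1],
        "rust": [0.1, 0.1, 0.7, 0.1],
        "scab": [0.1, 0.1, 0.1, 0.7],
    }
)

# Same computation as the grade() loop above: per-class AUC, then the mean.
class_columns = ["healthy", "multiple_diseases", "rust", "scab"]
scores = [roc_auc_score(y_true=answers[c], y_score=submission[c]) for c in class_columns]
print(np.mean(scores))  # 1.0 -- every class column is ranked perfectly
```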
mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py
@@ -0,0 +1,74 @@
+import shutil
+from pathlib import Path
+
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path) -> None:
+    expected_train_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]
+    expected_test_columns = ["image_id"]
+    expected_answers_columns = expected_train_columns
+    expected_sample_submission_columns = expected_train_columns
+
+    old_train = read_csv(raw / "train.csv")
+    new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
+
+    assert set(new_train.columns) == set(
+        expected_train_columns
+    ), f"Expected `new_train` to have columns {expected_train_columns} but got {new_train.columns}"
+
+    assert set(answers.columns) == set(
+        expected_answers_columns
+    ), f"Expected `answers` to have columns {expected_answers_columns} but got {answers.columns}"
+
+    new_train_image_ids = new_train["image_id"].unique()
+    new_test_image_ids = answers["image_id"].unique()
+    to_new_image_id = {
+        **{old_id: f"Train_{i}" for i, old_id in enumerate(new_train_image_ids)},
+        **{old_id: f"Test_{i}" for i, old_id in enumerate(new_test_image_ids)},
+    }
+
+    new_train["image_id"] = new_train["image_id"].replace(to_new_image_id)
+    answers["image_id"] = answers["image_id"].replace(to_new_image_id)
+
+    new_test = answers[["image_id"]].copy()
+
+    assert set(new_test.columns) == set(
+        expected_test_columns
+    ), f"Expected `new_test` to have columns {expected_test_columns} but got {new_test.columns}"
+
+    sample_submission = answers[["image_id"]].copy()
+    sample_submission[["healthy", "multiple_diseases", "rust", "scab"]] = 0.25
+
+    assert set(sample_submission.columns) == set(
+        expected_sample_submission_columns
+    ), f"Expected `sample_submission` to have columns {expected_sample_submission_columns} but got {sample_submission.columns}"
+
+    private.mkdir(exist_ok=True, parents=True)
+    public.mkdir(exist_ok=True, parents=True)
+    (public / "images").mkdir(exist_ok=True)
+
+    for old_image_id in tqdm(old_train["image_id"], desc="Copying over train & test images"):
+        assert old_image_id.startswith(
+            "Train_"
+        ), f"Expected train image id `{old_image_id}` to start with `Train_`."
+
+        new_image_id = to_new_image_id.get(old_image_id, old_image_id)
+
+        assert (
+            raw / "images" / f"{old_image_id}.jpg"
+        ).exists(), f"Image `{old_image_id}.jpg` does not exist in `{raw / 'images'}`."
+
+        shutil.copyfile(
+            src=raw / "images" / f"{old_image_id}.jpg",
+            dst=public / "images" / f"{new_image_id}.jpg",
+        )
+
+    answers.to_csv(private / "test.csv", index=False)
+
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    new_test.to_csv(public / "test.csv", index=False)
+    new_train.to_csv(public / "train.csv", index=False)
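The `to_new_image_id` dictionary above renumbers the raw Kaggle IDs, which themselves look like `Train_<k>` (hence the assertion in the copy loop), presumably so held-out test images cannot be trivially matched back to the raw training rows. A toy rerun of the same construction with invented IDs:

```python
# Invented raw IDs, standing in for whatever rows fell into each split.
new_train_image_ids = ["Train_7", "Train_2", "Train_9"]
new_test_image_ids = ["Train_4", "Train_5"]

# Same dictionary construction as in prepare() above.
to_new_image_id = {
    **{old_id: f"Train_{i}" for i, old_id in enumerate(new_train_image_ids)},
    **{old_id: f"Test_{i}" for i, old_id in enumerate(new_test_image_ids)},
}

print(to_new_image_id)
# {'Train_7': 'Train_0', 'Train_2': 'Train_1', 'Train_9': 'Train_2',
#  'Train_4': 'Test_0', 'Train_5': 'Test_1'}
```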
mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py
@@ -0,0 +1,160 @@
+import shutil
+from pathlib import Path
+
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path) -> None:
+    # --- Expected Column Definitions (used for both splits) ---
+    expected_train_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]
+    expected_test_columns = ["image_id"]
+    expected_answers_columns = expected_train_columns
+    expected_sample_submission_columns = expected_train_columns
+
+    # =================================================================
+    # == STAGE 1: Create the original train/test split.             ==
+    # == This section is preserved to ensure the original `public`  ==
+    # == and `private` directories are identical to the original script. ==
+    # =================================================================
+
+    old_train = read_csv(raw / "train.csv")
+    new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
+
+    assert set(new_train.columns) == set(
+        expected_train_columns
+    ), f"Expected `new_train` to have columns {expected_train_columns} but got {new_train.columns}"
+
+    assert set(answers.columns) == set(
+        expected_answers_columns
+    ), f"Expected `answers` to have columns {expected_answers_columns} but got {answers.columns}"
+
+    new_train_image_ids = new_train["image_id"].unique()
+    new_test_image_ids = answers["image_id"].unique()
+    to_new_image_id = {
+        **{old_id: f"Train_{i}" for i, old_id in enumerate(new_train_image_ids)},
+        **{old_id: f"Test_{i}" for i, old_id in enumerate(new_test_image_ids)},
+    }
+
+    # IMPORTANT: The `new_train` DataFrame is modified here and will be used
+    # as the input for the second split. We make a copy to preserve it
+    # before its image_ids are changed in-place for the first split's output.
+    train_for_val_split = new_train.copy()
+    new_train["image_id"] = new_train["image_id"].replace(to_new_image_id)
+    answers["image_id"] = answers["image_id"].replace(to_new_image_id)
+
+    new_test = answers[["image_id"]].copy()
+
+    assert set(new_test.columns) == set(
+        expected_test_columns
+    ), f"Expected `new_test` to have columns {expected_test_columns} but got {new_test.columns}"
+
+    sample_submission = answers[["image_id"]].copy()
+    sample_submission[["healthy", "multiple_diseases", "rust", "scab"]] = 0.25
+
+    assert set(sample_submission.columns) == set(
+        expected_sample_submission_columns
+    ), f"Expected `sample_submission` to have columns {expected_sample_submission_columns} but got {sample_submission.columns}"
+
+    private.mkdir(exist_ok=True, parents=True)
+    public.mkdir(exist_ok=True, parents=True)
+    (public / "images").mkdir(exist_ok=True)
+
+    # Note: This loop copies ALL images defined in the original raw train set.
+    for old_image_id in tqdm(old_train["image_id"], desc="Copying over train & test images"):
+        assert old_image_id.startswith(
+            "Train_"
+        ), f"Expected train image id `{old_image_id}` to start with `Train_`."
+
+        new_image_id = to_new_image_id.get(old_image_id, old_image_id)
+
+        assert (
+            raw / "images" / f"{old_image_id}.jpg"
+        ).exists(), f"Image `{old_image_id}.jpg` does not exist in `{raw / 'images'}`."
+
+        shutil.copyfile(
+            src=raw / "images" / f"{old_image_id}.jpg",
+            dst=public / "images" / f"{new_image_id}.jpg",
+        )
+
+    answers.to_csv(private / "test.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    new_test.to_csv(public / "test.csv", index=False)
+    new_train.to_csv(public / "train.csv", index=False)
+
+    # =================================================================
+    # == STAGE 2: Create the new validation split.                  ==
+    # == This section splits the `new_train` set from STAGE 1 to    ==
+    # == create a smaller training set and a validation set.        ==
+    # =================================================================
+    print("\nStarting second split to create validation set...")
+
+    # Define paths for the new validation set directories
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # To get a validation test set of roughly the same size as the original
+    # test set (10% of total), we must take 1/9th of the training set (90% of total).
+    # (0.1 * total) / (0.9 * total) = 1/9
+    val_test_size = 1 / 9.0
+
+    # Perform the second split on the original training data
+    train_val, answers_val = train_test_split(
+        train_for_val_split, test_size=val_test_size, random_state=0
+    )
+
+    # --- Replicate the ID renaming and file creation logic for the new split ---
+
+    train_val_image_ids = train_val["image_id"].unique()
+    test_val_image_ids = answers_val["image_id"].unique()
+    to_new_val_image_id = {
+        **{old_id: f"Train_{i}" for i, old_id in enumerate(train_val_image_ids)},
+        **{old_id: f"Test_{i}" for i, old_id in enumerate(test_val_image_ids)},
+    }
+
+    # The original image IDs from this split are keys in `to_new_image_id`.
+    # We need to map them to find the source file in `public/images`.
+    # e.g., raw 'Train_123' -> 1st split 'Train_45' -> 2nd split 'Test_6'
+    id_mapper_raw_to_val = {
+        raw_id: val_id
+        for raw_id, val_id in to_new_val_image_id.items()
+    }
+
+    # Get the intermediate filenames from the first split's mapping
+    source_to_dest_val_map = {
+        to_new_image_id[raw_id]: val_id
+        for raw_id, val_id in id_mapper_raw_to_val.items()
+    }
+
+
+    train_val["image_id"] = train_val["image_id"].replace(to_new_val_image_id)
+    answers_val["image_id"] = answers_val["image_id"].replace(to_new_val_image_id)
+
+    test_val = answers_val[["image_id"]].copy()
+    sample_submission_val = answers_val[["image_id"]].copy()
+    sample_submission_val[["healthy", "multiple_diseases", "rust", "scab"]] = 0.25
+
+    # Create the new directories
+    private_val.mkdir(exist_ok=True, parents=True)
+    public_val.mkdir(exist_ok=True, parents=True)
+    (public_val / "images").mkdir(exist_ok=True)
+
+    # Copy images for the validation split. The source is the `public/images`
+    # directory created in the first stage.
+    for source_filename_stem, dest_filename_stem in tqdm(
+        source_to_dest_val_map.items(), desc="Copying over validation images"
+    ):
+        shutil.copyfile(
+            src=public / "images" / f"{source_filename_stem}.jpg",
+            dst=public_val / "images" / f"{dest_filename_stem}.jpg",
+        )
+
+    # Save all files for the validation split into the new directories
+    answers_val.to_csv(private_val / "test.csv", index=False)
+    sample_submission_val.to_csv(public_val / "sample_submission.csv", index=False)
+    test_val.to_csv(public_val / "test.csv", index=False)
+    train_val.to_csv(public_val / "train.csv", index=False)
+
+    print("\nValidation set created successfully.")
mlebench/competitions/plant-pathology-2021-fgvc8/grade.py
@@ -0,0 +1,54 @@
+import pandas as pd
+from scipy.sparse import csr_matrix
+from sklearn.metrics import f1_score
+from sklearn.preprocessing import MultiLabelBinarizer
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(
+    submission: pd.DataFrame, answers: pd.DataFrame
+) -> tuple[csr_matrix, csr_matrix]:
+    """Transforms the submission and answers DataFrames into the required format for grading as sparse matrices."""
+
+    # Answers checks
+    assert (
+        "image" in answers.columns and "labels" in answers.columns
+    ), "Answers DataFrame must have 'image' and 'labels' columns"
+
+    # Submission checks
+    if "image" not in submission.columns:
+        raise InvalidSubmissionError("Submission DataFrame must have an 'image' column.")
+    if "labels" not in submission.columns:
+        raise InvalidSubmissionError("Submission DataFrame must have a 'labels' column.")
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            "Submission and answers DataFrames must have the same number of rows."
+        )
+
+    # Match order
+    submission = submission.sort_values(by="image").reset_index(drop=True)
+    answers = answers.sort_values(by="image").reset_index(drop=True)
+    if not all(submission["image"] == answers["image"]):
+        raise InvalidSubmissionError(
+            "Submission and answers DataFrames must have matching 'image' columns."
+        )
+
+    # Get classes
+    classes = set(answers["labels"].str.split().explode().unique())
+
+    # fillna with empty string
+    answers["labels"] = answers["labels"].fillna("")
+    submission["labels"] = submission["labels"].fillna("")
+
+    # Convert to sparse matrices using MultiLabelBinarizer
+    mlb = MultiLabelBinarizer(classes=sorted(classes), sparse_output=True)
+    y_true = mlb.fit_transform(answers["labels"].str.split())
+    y_pred = mlb.transform(submission["labels"].str.split())
+
+    return y_true, y_pred
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    y_true, y_pred = prepare_for_metric(submission, answers)
+    return f1_score(y_true=y_true, y_pred=y_pred, average="micro")
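Unlike the 2020 grader, this one uses micro-averaged F1 over binarised, space-separated label sets. The toy computation below (invented data, dense rather than sparse matrices) mirrors the core of `prepare_for_metric` and `grade` above:

```python
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Invented 3-row example of space-separated multi-label targets.
answers = pd.DataFrame(
    {"image": ["a.jpg", "b.jpg", "c.jpg"], "labels": ["healthy", "rust scab", "scab"]}
)
# The submission misses `rust` on the second image but is otherwise correct.
submission = pd.DataFrame(
    {"image": ["a.jpg", "b.jpg", "c.jpg"], "labels": ["healthy", "scab", "scab"]}
)

# Same binarisation as prepare_for_metric() above, just without sparse output.
classes = sorted(set(answers["labels"].str.split().explode()))
mlb = MultiLabelBinarizer(classes=classes)
y_true = mlb.fit_transform(answers["labels"].str.split())
y_pred = mlb.transform(submission["labels"].str.split())

print(f1_score(y_true=y_true, y_pred=y_pred, average="micro"))  # 6/7 ~ 0.857
```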
mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py
@@ -0,0 +1,65 @@
+import shutil
+from pathlib import Path
+
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import extract, read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+
+    # Create train, test from train split
+    old_train = read_csv(raw / "train.csv")
+    new_train, answers = train_test_split(old_train, test_size=0.2, random_state=0)
+
+    # Create a sample submission file
+    submission_df = answers.copy()
+    submission_df["labels"] = "healthy"
+
+    # Checks
+    assert len(answers) == len(submission_df), "Answers and submission should have the same length"
+    assert not set(new_train["image"]).intersection(
+        set(answers["image"])
+    ), "new_train and answers should not share any image"
+    assert (
+        "image" in new_train.columns and "labels" in new_train.columns
+    ), "Train DataFrame must have 'image' and 'labels' columns"
+    assert (
+        "image" in submission_df.columns and "labels" in submission_df.columns
+    ), "Sample submission DataFrame must have 'image' and 'labels' columns"
+    assert len(new_train) + len(answers) == len(
+        old_train
+    ), "The combined length of new_train and answers should equal the length of old_train"
+
+    # Write CSVs
+    answers.to_csv(private / "answers.csv", index=False)
+    new_train.to_csv(public / "train.csv", index=False)
+    submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+    # Copy files
+    (public / "test_images").mkdir(exist_ok=True)
+    (public / "train_images").mkdir(exist_ok=True)
+
+    for file_id in tqdm(new_train["image"], desc="Copying Train Images"):
+        shutil.copyfile(
+            src=raw / "train_images" / f"{file_id}",
+            dst=public / "train_images" / f"{file_id}",
+        )
+
+    for file_id in tqdm(answers["image"], desc="Copying Test Images"):
+        shutil.copyfile(
+            src=raw / "train_images" / f"{file_id}",
+            dst=public / "test_images" / f"{file_id}",
+        )
+
+    # Checks
+    assert len(list(public.glob("train_images/*.jpg"))) == len(
+        new_train
+    ), "Public train images should have the same number of images as the train DataFrame"
+    assert len(list(public.glob("test_images/*.jpg"))) == len(
+        answers
+    ), "Public test images should have the same number of images as the answers DataFrame"
|