dslighting 1.7.1-py3-none-any.whl → 1.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py
@@ -0,0 +1,74 @@
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(submission: DataFrame, answers: DataFrame) -> dict:
+
+    assert "Patient_Week" in answers.columns, "Answers DataFrame must have a 'Patient_Week' column."
+    assert "FVC" in answers.columns, "Answers DataFrame must have a 'FVC' column."
+    assert "Patient" in answers.columns, "Answers DataFrame must have a 'Patient' column."
+    if "Patient_Week" not in submission.columns:
+        raise InvalidSubmissionError("Submission DataFrame must have a 'Patient_Week' column.")
+    if "FVC" not in submission.columns:
+        raise InvalidSubmissionError("Submission DataFrame must have a 'FVC' column.")
+    if "Confidence" not in submission.columns:
+        raise InvalidSubmissionError("Submission DataFrame must have a 'Confidence' column.")
+    for pw in submission["Patient_Week"]:
+        if pw not in answers["Patient_Week"].values:
+            raise InvalidSubmissionError(
+                f"Patient_Week {pw} in submission does not exist in answers"
+            )
+    if not pd.api.types.is_numeric_dtype(submission["FVC"]):
+        raise InvalidSubmissionError("FVC column in submission must be numeric.")
+    if not pd.api.types.is_numeric_dtype(submission["Confidence"]):
+        raise InvalidSubmissionError("Confidence column in submission must be numeric.")
+
+    # fillna with 0 for the confidence column
+    submission["Confidence"] = submission["Confidence"].fillna(0)
+
+    # We should only take the 3 latest Patient_Week records for each patient
+    answers = answers.dropna(subset=["FVC"])  # Drop the dummy data
+    answers["Week_Number"] = answers["Patient_Week"].apply(lambda x: int(x.split("_")[-1]))
+    latest_weeks = answers.sort_values("Week_Number").groupby("Patient").tail(3)
+    answers = latest_weeks.drop(columns=["Week_Number"])
+    # Make submission match; we only grade the prediction for the 3 latest weeks
+    submission = submission[submission["Patient_Week"].isin(answers["Patient_Week"])]
+
+    submission = submission.sort_values(by="Patient_Week")
+    answers = answers.sort_values(by="Patient_Week")
+
+    fvc_true = answers.loc[answers["Patient_Week"].isin(submission["Patient_Week"]), "FVC"].values
+    fvc_pred = submission.loc[
+        submission["Patient_Week"].isin(answers["Patient_Week"]), "FVC"
+    ].values
+    confidence = submission.loc[
+        submission["Patient_Week"].isin(answers["Patient_Week"]), "Confidence"
+    ].values
+
+    return {"fvc_true": fvc_true, "fvc_pred": fvc_pred, "confidence": confidence}
+
+
+def laplace_log_likelihood(
+    actual_fvc: np.ndarray, predicted_fvc: np.ndarray, confidence: np.ndarray, return_values=False
+) -> float:
+    """
+    Calculates the modified Laplace Log Likelihood score for osic-pulmonary-fibrosis-progression
+    See https://www.kaggle.com/code/rohanrao/osic-understanding-laplace-log-likelihood for details on this implementation
+    """
+    sd_clipped = np.maximum(confidence, 70)
+    delta = np.minimum(np.abs(actual_fvc - predicted_fvc), 1000)
+    metric = -np.sqrt(2) * delta / sd_clipped - np.log(np.sqrt(2) * sd_clipped)
+
+    if return_values:
+        return metric
+    else:
+        return np.mean(metric)
+
+
+def grade(submission: DataFrame, answers: DataFrame) -> float:
+    fvc_true, fvc_pred, confidence = prepare_for_metric(submission, answers).values()
+
+    return laplace_log_likelihood(fvc_true, fvc_pred, confidence)
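For reference, the modified Laplace log likelihood above clips the submitted confidence (standard deviation) to a minimum of 70 and the absolute FVC error to a maximum of 1000, so each prediction's contribution to the score is bounded; with a zero error at the minimum clipped standard deviation the per-row score is about -4.6, the best attainable value. The following is a minimal toy sketch of that computation, not package code, with made-up values:

```python
# Toy sketch of the Laplace log likelihood metric above (illustration only).
import numpy as np

actual_fvc = np.array([2800.0, 2500.0])
predicted_fvc = np.array([2750.0, 2700.0])
confidence = np.array([100.0, 50.0])  # the 50 is clipped up to 70 by the metric

sd_clipped = np.maximum(confidence, 70)
delta = np.minimum(np.abs(actual_fvc - predicted_fvc), 1000)
metric = -np.sqrt(2) * delta / sd_clipped - np.log(np.sqrt(2) * sd_clipped)
print(metric.mean())  # approximately -7.15; higher (less negative) is better
```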
mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py
@@ -0,0 +1,95 @@
+import shutil
+from multiprocessing import Pool
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.utils import read_csv
+
+
+def copy_dir(args):
+    src_dir, dst_dir = args
+    shutil.copytree(src=src_dir, dst=dst_dir, dirs_exist_ok=True)
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    # Splitting the train set into train and test with unique Patients
+    old_train = read_csv(raw / "train.csv")
+    grouped_by_patient = list(old_train.groupby("Patient"))
+    train_groups, test_groups = train_test_split(grouped_by_patient, test_size=0.1, random_state=0)
+    new_train = pd.concat([group for _, group in train_groups])
+    new_test = pd.concat([group for _, group in test_groups])
+    assert set(new_train["Patient"]).isdisjoint(
+        set(new_test["Patient"])
+    ), "There are Patients who are in both train and test sets."
+
+    # For the public new_test set we will only keep each patients first FVS measurement. The task is to predict FVS measurements for all possible weeks
+    new_test_public = new_test.sort_values(by="Weeks").groupby("Patient").first().reset_index()
+
+    # Creating the private answers CSV. We need to fill out dummy FVS measurements for all weeks that don't have data so as to match sample_submission.csv
+    # Create a DataFrame with all possible Patient-Week combinations
+    all_weeks = pd.DataFrame(
+        [
+            (patient, week)
+            for patient in new_test["Patient"].unique()
+            for week in range(new_test["Weeks"].min(), new_test["Weeks"].max() + 1)
+        ],
+        columns=["Patient", "Weeks"],
+    )
+    # Merge with the new_test DataFrame to fill in missing weeks with NaN values
+    new_test = all_weeks.merge(new_test, on=["Patient", "Weeks"], how="left")
+    new_test["Patient_Week"] = new_test["Patient"] + "_" + new_test["Weeks"].astype(str)
+    new_test["Confidence"] = 100
+    assert (
+        new_test.groupby("Patient").size().nunique() == 1
+    ), "Not all patients have the same number of rows."
+
+    # Create a sample submission file
+    submission_df = new_test.copy()
+    submission_df = submission_df[["Patient_Week"]]
+    submission_df["FVC"] = 2000  # Dummy predictions
+    submission_df["Confidence"] = 100  # Dummy confidence
+
+    # Write CSVs
+    new_train.to_csv(public / "train.csv", index=False)
+    new_test_public.to_csv(public / "test.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+    submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+    # Copy over data files
+    (public / "train").mkdir(exist_ok=True)
+    train_args = [
+        (raw / "train" / patient, public / "train" / patient)
+        for patient in new_train["Patient"].unique()
+    ]
+    with Pool() as pool:
+        list(
+            tqdm(pool.imap(copy_dir, train_args), total=len(train_args), desc="Copying train data")
+        )
+
+    (public / "test").mkdir(exist_ok=True)
+    test_args = [
+        (raw / "train" / patient, public / "test" / patient)
+        for patient in new_test["Patient"].unique()
+    ]
+    with Pool() as pool:
+        list(tqdm(pool.imap(copy_dir, test_args), total=len(test_args), desc="Copying test data"))
+
+    # Final checks
+    assert new_train.shape[1] == 7, f"Expected 7 columns in new_train, but got {new_train.shape[1]}"
+    assert new_test.shape[1] == 9, f"Expected 9 columns in new_test, but got {new_test.shape[1]}"
+    assert (
+        new_test_public.shape[1] == 7
+    ), f"Expected 7 columns in new_test_public, but got {new_test_public.shape[1]}"
+    assert (
+        submission_df.shape[1] == 3
+    ), f"Expected 3 columns in submission_df, but got {submission_df.shape[1]}"
+
+    public_train_dirs = set((public / "train").iterdir())
+    public_test_dirs = set((public / "test").iterdir())
+    common_dirs = public_train_dirs.intersection(public_test_dirs)
+    assert (
+        not common_dirs
+    ), f"There are directories with the same name in public train and test: {common_dirs}"
mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py
@@ -0,0 +1,167 @@
+import shutil
+from multiprocessing import Pool
+from pathlib import Path
+from typing import List, Tuple
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.utils import read_csv
+
+
+def copy_dir(args):
+    src_dir, dst_dir = args
+    shutil.copytree(src=src_dir, dst=dst_dir, dirs_exist_ok=True)
+
+
+def _create_dataset_split(
+    train_groups: List[Tuple[str, pd.DataFrame]],
+    test_groups: List[Tuple[str, pd.DataFrame]],
+    raw_path: Path,
+    public_path: Path,
+    private_path: Path,
+):
+    """
+    Helper function to process a single train/test split of patient groups.
+
+    This function takes lists of train and test patient groups, creates the necessary
+    DataFrames, saves them to the specified public and private directories, and copies
+    the associated patient data folders. This logic is used for both the original
+    train/test split and the new train/validation split.
+    """
+    # Ensure output directories exist
+    public_path.mkdir(exist_ok=True)
+    private_path.mkdir(exist_ok=True)
+
+    # Recreate DataFrames from the split groups
+    new_train = pd.concat([group for _, group in train_groups])
+    new_test = pd.concat([group for _, group in test_groups])
+    assert set(new_train["Patient"]).isdisjoint(
+        set(new_test["Patient"])
+    ), "There are Patients who are in both train and test sets."
+
+    # For the public new_test set we will only keep each patients first FVS measurement. The task is to predict FVS measurements for all possible weeks
+    new_test_public = new_test.sort_values(by="Weeks").groupby("Patient").first().reset_index()
+
+    # Creating the private answers CSV. We need to fill out dummy FVS measurements for all weeks that don't have data so as to match sample_submission.csv
+    # Create a DataFrame with all possible Patient-Week combinations
+    all_weeks = pd.DataFrame(
+        [
+            (patient, week)
+            for patient in new_test["Patient"].unique()
+            for week in range(new_test["Weeks"].min(), new_test["Weeks"].max() + 1)
+        ],
+        columns=["Patient", "Weeks"],
+    )
+    # Merge with the new_test DataFrame to fill in missing weeks with NaN values
+    new_test_private = all_weeks.merge(new_test, on=["Patient", "Weeks"], how="left")
+    new_test_private["Patient_Week"] = (
+        new_test_private["Patient"] + "_" + new_test_private["Weeks"].astype(str)
+    )
+    new_test_private["Confidence"] = 100
+    assert (
+        new_test_private.groupby("Patient").size().nunique() == 1
+    ), "Not all patients have the same number of rows."
+
+    # Create a sample submission file
+    submission_df = new_test_private.copy()
+    submission_df = submission_df[["Patient_Week"]]
+    submission_df["FVC"] = 2000  # Dummy predictions
+    submission_df["Confidence"] = 100  # Dummy confidence
+
+    # Write CSVs
+    new_train.to_csv(public_path / "train.csv", index=False)
+    new_test_public.to_csv(public_path / "test.csv", index=False)
+    new_test_private.to_csv(private_path / "test.csv", index=False)
+    submission_df.to_csv(public_path / "sample_submission.csv", index=False)
+
+    # Copy over data files
+    (public_path / "train").mkdir(exist_ok=True)
+    train_args = [
+        (raw_path / "train" / patient, public_path / "train" / patient)
+        for patient in new_train["Patient"].unique()
+    ]
+    with Pool() as pool:
+        list(
+            tqdm(
+                pool.imap(copy_dir, train_args),
+                total=len(train_args),
+                desc=f"Copying train data to {public_path.name}",
+            )
+        )
+
+    (public_path / "test").mkdir(exist_ok=True)
+    test_args = [
+        (raw_path / "train" / patient, public_path / "test" / patient)
+        for patient in new_test["Patient"].unique()
+    ]
+    with Pool() as pool:
+        list(
+            tqdm(
+                pool.imap(copy_dir, test_args),
+                total=len(test_args),
+                desc=f"Copying test data to {public_path.name}",
+            )
+        )
+
+    # Final checks
+    assert new_train.shape[1] == 7, f"Expected 7 columns in new_train, but got {new_train.shape[1]}"
+    assert (
+        new_test_private.shape[1] == 9
+    ), f"Expected 9 columns in new_test, but got {new_test_private.shape[1]}"
+    assert (
+        new_test_public.shape[1] == 7
+    ), f"Expected 7 columns in new_test_public, but got {new_test_public.shape[1]}"
+    assert (
+        submission_df.shape[1] == 3
+    ), f"Expected 3 columns in submission_df, but got {submission_df.shape[1]}"
+
+    public_train_dirs = set((public_path / "train").iterdir())
+    public_test_dirs = set((public_path / "test").iterdir())
+    common_dirs = public_train_dirs.intersection(public_test_dirs)
+    assert (
+        not common_dirs
+    ), f"There are directories with the same name in public train and test: {common_dirs}"
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    # Read raw data and group by patient to ensure patient-level splits
+    old_train = read_csv(raw / "train.csv")
+    grouped_by_patient = list(old_train.groupby("Patient"))
+
+    # ---- 1. Create the original train/test split ----
+    # This split creates the main competition data in `public` and `private`
+    train_groups, test_groups = train_test_split(
+        grouped_by_patient, test_size=0.1, random_state=0
+    )
+    _create_dataset_split(
+        train_groups=train_groups,
+        test_groups=test_groups,
+        raw_path=raw,
+        public_path=public,
+        private_path=private,
+    )
+
+    # ---- 2. Create the new train/validation split ----
+    # This second split uses the `train_groups` from the first split to create
+    # a smaller training set and a validation set. The outputs are saved in
+    # parallel directories (`public_val`, `private_val`) to avoid altering
+    # the original competition files.
+
+    # Define new paths for the validation split
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # Split the training data again to create a validation set.
+    # We use the *same logic and random_state* as the original split.
+    train_val_groups, test_val_groups = train_test_split(
+        train_groups, test_size=0.1, random_state=0
+    )
+    _create_dataset_split(
+        train_groups=train_val_groups,
+        test_groups=test_val_groups,
+        raw_path=raw,
+        public_path=public_val,
+        private_path=private_val,
+    )
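The prepare_val variant above reuses the first-stage train groups and splits them again with the same test_size and random_state, writing the result to parallel public_val/private_val directories. A toy sketch of that two-stage split pattern, not package code, with plain integers standing in for the per-patient groups:

```python
from sklearn.model_selection import train_test_split

groups = list(range(100))  # stand-ins for the per-patient groups
# First split: main competition train/test (90/10 here).
train, test = train_test_split(groups, test_size=0.1, random_state=0)
# Second split: re-partition the train portion into a smaller train set and a validation set.
train_val, val = train_test_split(train, test_size=0.1, random_state=0)
print(len(train), len(test), len(train_val), len(val))  # 90 10 81 9
```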
mlebench/competitions/paddy-disease-classification/grade.py
@@ -0,0 +1,35 @@
+from pandas import DataFrame
+from sklearn.metrics import accuracy_score
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def grade(submission: DataFrame, answers: DataFrame) -> float:
+    expected_columns = set(["image_id", "label"])
+
+    assert set(answers.columns).issuperset(
+        expected_columns
+    ), f"Expected columns {expected_columns}, but got {set(answers.columns)}"
+
+    assert not answers.isnull().values.any(), "Answers contain NaN values"
+
+    if not set(submission.columns).issuperset(expected_columns):
+        raise InvalidSubmissionError(
+            f"Expected columns {expected_columns}, but got {set(submission.columns)}"
+        )
+
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(f"Expected {len(answers)} rows, but got {len(submission)}")
+
+    sorted_submission = submission.sort_values(by="image_id").reset_index(drop=True)
+    sorted_answers = answers.sort_values(by="image_id").reset_index(drop=True)
+
+    if (sorted_submission["image_id"].values != sorted_answers["image_id"].values).any():
+        raise InvalidSubmissionError("Invalid image IDs in the submission!")
+
+    y_pred = sorted_submission["label"].fillna("").tolist()
+    y_true = sorted_answers["label"]
+
+    score = accuracy_score(y_true=y_true, y_pred=y_pred)
+
+    return score
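After validation, the grader above reduces to plain classification accuracy over (image_id, label) pairs once the two frames are aligned on image_id. A toy sketch of the same alignment and scoring, not package code; the image IDs and labels are made up:

```python
import pandas as pd
from sklearn.metrics import accuracy_score

answers = pd.DataFrame({"image_id": ["a.jpg", "b.jpg"], "label": ["blast", "normal"]})
submission = pd.DataFrame({"image_id": ["b.jpg", "a.jpg"], "label": ["normal", "blast"]})

# Align both frames on image_id, as grade() does, then score.
sub = submission.sort_values(by="image_id").reset_index(drop=True)
ans = answers.sort_values(by="image_id").reset_index(drop=True)
print(accuracy_score(y_true=ans["label"], y_pred=sub["label"].fillna("").tolist()))  # 1.0
```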
mlebench/competitions/paddy-disease-classification/prepare.py
@@ -0,0 +1,69 @@
+import shutil
+from pathlib import Path
+
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+
+def copy(src: Path, dst: Path):
+    """A wrapper for `shutil.copy` which creates destination directories when they don't exist."""
+
+    assert src.exists(), f"{src} does not exist"
+    assert not dst.exists(), f"{dst} already exists"
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy(src, dst)
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    old_train = read_csv(raw / "train.csv")
+    old_sample_submission = read_csv(raw / "sample_submission.csv")
+
+    # The original dataset has 10,407 train images and 3,469 test images.
+    # This implies a 75%/25% train/test split.
+    new_train, new_test = train_test_split(old_train, train_size=0.75, random_state=0)
+
+    new_sample_submission = new_test[["image_id", "label"]].copy()
+    new_sample_submission["label"] = ""
+
+    new_train.to_csv(public / "train.csv", index=False)
+    new_sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+
+    for row in new_train.itertuples():
+        copy(
+            raw / "train_images" / row.label / row.image_id,
+            public / "train_images" / row.label / row.image_id,
+        )
+
+    for row in new_test.itertuples():
+        copy(
+            raw / "train_images" / row.label / row.image_id,
+            public / "test_images" / row.image_id,
+        )
+
+    # Sanity checks
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), f"Expected the combined size of the new train and test sets to be the same size as the original train set, but it wasn't! Got {len(new_train) + len(new_test)} != {len(old_train)}."
+
+    new_train_image_ids = set(new_train.image_id)
+    new_test_image_ids = set(new_test.image_id)
+
+    assert new_train_image_ids.isdisjoint(new_test_image_ids), "Train and test sets overlap!"
+
+    assert set(new_train.columns) == set(
+        old_train.columns
+    ), f"Expected the new train set to have the same columns as the original train set, but it didn't! Got {set(new_train.columns)} != {set(old_train.columns)}."
+
+    assert set(new_sample_submission.columns) == set(
+        old_sample_submission.columns
+    ), f"Expected the new sample submission to have the same columns as the original sample submission, but it didn't! Got {set(new_sample_submission.columns)} != {set(old_sample_submission.columns)}."
+
+    assert len(list(public.glob("train_images/*/*.jpg"))) == len(
+        new_train
+    ), f"Expected the number of images in the `public / train_images` directory to match the number of rows in the `public / train.csv` file, but it didn't! Got {len(list(public.glob('train_images/*/*.jpg')))} != {len(new_train)}."
+
+    assert len(list(public.glob("test_images/*.jpg"))) == len(
+        new_test
+    ), f"Expected the number of images in the `public / test_images` directory to match the number of rows in the `private / test.csv` file, but it didn't! Got {len(list(public.glob('test_images/*.jpg')))} != {len(new_test)}."
mlebench/competitions/paddy-disease-classification/prepare_val.py
@@ -0,0 +1,122 @@
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+
+def copy(src: Path, dst: Path):
+    """A wrapper for `shutil.copy` which creates destination directories when they don't exist."""
+
+    assert src.exists(), f"{src} does not exist"
+    # Allow overwriting for simplicity in rerunning the script
+    if dst.exists():
+        if dst.is_dir():
+            shutil.rmtree(dst)
+        else:
+            dst.unlink()
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy(src, dst)
+
+
+def _process_and_save_split(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    public_path: Path,
+    private_path: Path,
+    raw_path: Path,
+    old_sample_submission: pd.DataFrame,
+):
+    """
+    Helper function to process a single train/test split and save the results.
+    This encapsulates the logic for creating CSVs, copying images, and running checks.
+    """
+    sample_submission = test_df[["image_id", "label"]].copy()
+    sample_submission["label"] = ""
+
+    # Ensure output directories exist
+    public_path.mkdir(exist_ok=True)
+    private_path.mkdir(exist_ok=True)
+
+    train_df.to_csv(public_path / "train.csv", index=False)
+    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)
+    test_df.to_csv(private_path / "test.csv", index=False)
+
+    for row in train_df.itertuples():
+        copy(
+            raw_path / "train_images" / row.label / row.image_id,
+            public_path / "train_images" / row.label / row.image_id,
+        )
+
+    for row in test_df.itertuples():
+        copy(
+            raw_path / "train_images" / row.label / row.image_id,
+            public_path / "test_images" / row.image_id,
+        )
+
+    # Sanity checks (generalized for any split)
+    train_image_ids = set(train_df.image_id)
+    test_image_ids = set(test_df.image_id)
+
+    assert train_image_ids.isdisjoint(test_image_ids), f"Train and test sets overlap in {public_path}!"
+
+    assert set(train_df.columns) == set(
+        test_df.columns
+    ), f"Expected the new train and test sets to have the same columns, but they didn't! Got {set(train_df.columns)} != {set(test_df.columns)} in {public_path}."
+
+    assert set(sample_submission.columns) == set(
+        old_sample_submission.columns
+    ), f"Expected the new sample submission to have the same columns as the original sample submission, but it didn't! Got {set(sample_submission.columns)} != {set(old_sample_submission.columns)} in {public_path}."
+
+    assert len(list(public_path.glob("train_images/*/*.jpg"))) == len(
+        train_df
+    ), f"Expected the number of images in the `{public_path / 'train_images'}` directory to match the number of rows in the `{public_path / 'train.csv'}` file, but it didn't! Got {len(list(public_path.glob('train_images/*/*.jpg')))} != {len(train_df)}."
+
+    assert len(list(public_path.glob("test_images/*.jpg"))) == len(
+        test_df
+    ), f"Expected the number of images in the `{public_path / 'test_images'}` directory to match the number of rows in the `{private_path / 'test.csv'}` file, but it didn't! Got {len(list(public_path.glob('test_images/*.jpg')))} != {len(test_df)}."
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    old_train = read_csv(raw / "train.csv")
+    old_sample_submission = read_csv(raw / "sample_submission.csv")
+
+    # The original dataset has 10,407 train images and 3,469 test images.
+    # This implies a 75%/25% train/test split.
+    new_train, new_test = train_test_split(old_train, train_size=0.75, random_state=0)
+
+    # Process and save the original split to `public` and `private` directories
+    _process_and_save_split(
+        train_df=new_train,
+        test_df=new_test,
+        public_path=public,
+        private_path=private,
+        raw_path=raw,
+        old_sample_submission=old_sample_submission,
+    )
+
+    # Sanity check specific to the first split
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), f"Expected the combined size of the new train and test sets to be the same size as the original train set, but it wasn't! Got {len(new_train) + len(new_test)} != {len(old_train)}."
+
+    # --- New Validation Set Creation ---
+    # Define paths for the new validation split
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # Perform a second split on the `new_train` set created above.
+    # Use the exact same splitting parameters to replicate the logic.
+    train_val, test_val = train_test_split(new_train, train_size=0.75, random_state=0)
+
+    # Process and save the new validation split to `public_val` and `private_val`
+    _process_and_save_split(
+        train_df=train_val,
+        test_df=test_val,
+        public_path=public_val,
+        private_path=private_val,
+        raw_path=raw,
+        old_sample_submission=old_sample_submission,
+    )
mlebench/competitions/petfinder-pawpularity-score/grade.py
@@ -0,0 +1,41 @@
+import pandas as pd
+from sklearn.metrics import root_mean_squared_error
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+    id_col = "Id"
+    target_col = "Pawpularity"
+
+    assert id_col in answers.columns, f"Missing {id_col} column in answers"
+    assert target_col in answers.columns, f"Missing {target_col} column in answers"
+    assert (
+        answers[target_col].between(1, 100).all()
+    ), f"Pawpularity in answers should be between 1 and 100"
+
+    if id_col not in submission.columns:
+        raise InvalidSubmissionError(f"Missing {id_col} column in submission")
+
+    if target_col not in submission.columns:
+        raise InvalidSubmissionError(f"Missing {target_col} column in submission")
+
+    if not submission[target_col].between(1, 100).all():
+        raise InvalidSubmissionError(f"Pawpularity in submission should be between 1 and 100")
+
+    submission = submission.sort_values(id_col)
+    answers = answers.sort_values(id_col)
+
+    if (submission[id_col].values != answers[id_col].values).any():
+        raise InvalidSubmissionError(f"IDs in submission do not match IDs in answers")
+
+    return {
+        "y_true": answers[target_col].to_numpy(),
+        "y_pred": submission[target_col].to_numpy(),
+    }
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    rmse_input = prepare_for_metric(submission, answers)
+    score = root_mean_squared_error(**rmse_input)
+    return score
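The grader above is RMSE between the Pawpularity columns after the 1-100 range check and an alignment on Id; note that `root_mean_squared_error` is only available in recent scikit-learn releases (1.4+). A toy sketch of the same computation, not package code, with made-up IDs and scores:

```python
import pandas as pd
from sklearn.metrics import root_mean_squared_error  # scikit-learn >= 1.4

answers = pd.DataFrame({"Id": ["x1", "x2"], "Pawpularity": [30, 70]})
submission = pd.DataFrame({"Id": ["x2", "x1"], "Pawpularity": [60, 40]})

# Align on Id, as prepare_for_metric() does, then compute RMSE.
ans = answers.sort_values("Id")
sub = submission.sort_values("Id")
print(root_mean_squared_error(ans["Pawpularity"].to_numpy(), sub["Pawpularity"].to_numpy()))  # 10.0
```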