dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import random
|
|
3
|
+
import shutil
|
|
4
|
+
import tarfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
from mlebench.utils import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def add_to_tar(src: Path, out: Path):
    """Archive every file under `src` into an uncompressed tar at `out`.

    Each member is stored with its path relative to `src`, so unpacking
    recreates the directory's contents without the `src` prefix. A tqdm
    progress bar reports per-file progress.
    """
    assert src.exists(), f"Source directory `{src}` does not exist."
    assert src.is_dir(), f"Expected a directory, but got `{src}`."

    # Collect regular files up front so the progress bar knows the total.
    members = [p for p in src.rglob("*") if p.is_file()]
    progress = tqdm(
        members,
        desc=f"Taring {src.name} to {out.name}",
        unit="file",
        total=len(members),
    )

    with tarfile.open(out, "w") as archive:
        for member in progress:
            archive.add(member, arcname=member.relative_to(src))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def prepare(raw: Path, public: Path, private: Path):
|
|
31
|
+
|
|
32
|
+
dev_mode = False
|
|
33
|
+
image_count = 2 if dev_mode else float("inf") # We copy over 2 images per category for dev mode
|
|
34
|
+
|
|
35
|
+
# Extract train_val2019.tar.gz which contains images
|
|
36
|
+
train_tar_path = raw / "train_val2019.tar.gz"
|
|
37
|
+
train_extract_path = raw
|
|
38
|
+
if not (raw / "train_val2019").exists():
|
|
39
|
+
shutil.unpack_archive(train_tar_path, train_extract_path)
|
|
40
|
+
|
|
41
|
+
# Create train, test from train split
|
|
42
|
+
json_path = raw / "train2019.json"
|
|
43
|
+
with open(json_path, "r", encoding="utf-8") as f:
|
|
44
|
+
old_train_metadata = json.load(f)
|
|
45
|
+
|
|
46
|
+
# Organize data by category so that we can split per-category later
|
|
47
|
+
annotation_image_metadata_by_category = {} # We'll collect both `annotations` and `images` here
|
|
48
|
+
for annotation_info, image_info in list(
|
|
49
|
+
zip(old_train_metadata["annotations"], old_train_metadata["images"])
|
|
50
|
+
):
|
|
51
|
+
assert (
|
|
52
|
+
annotation_info["image_id"] == image_info["id"]
|
|
53
|
+
), f"Mismatching image_id in annotation and image: {annotation_info['image_id']} vs {image_info['id']}"
|
|
54
|
+
category_id = annotation_info["category_id"]
|
|
55
|
+
if category_id not in annotation_image_metadata_by_category:
|
|
56
|
+
annotation_image_metadata_by_category[category_id] = []
|
|
57
|
+
annotation_image_metadata_by_category[category_id].append(
|
|
58
|
+
{
|
|
59
|
+
"annotation": annotation_info,
|
|
60
|
+
"image": image_info,
|
|
61
|
+
}
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Split train/test
|
|
65
|
+
train_sample_count = 0 # Useful for tqdm later
|
|
66
|
+
train_annotation_image_metadata_by_category = {}
|
|
67
|
+
test_annotation_image_metadata_by_category = {}
|
|
68
|
+
|
|
69
|
+
for category_id, annotation_image_metadata in tqdm(
|
|
70
|
+
annotation_image_metadata_by_category.items(), desc="Assigning train/test splits"
|
|
71
|
+
):
|
|
72
|
+
# Create split by "category" (class)
|
|
73
|
+
# Original train+val has 268,243 images, test has 35,400 images, 0.12 ratio
|
|
74
|
+
test_size = 0.12
|
|
75
|
+
n_samples = len(annotation_image_metadata)
|
|
76
|
+
if n_samples == 1:
|
|
77
|
+
# If only one sample, put it in train
|
|
78
|
+
train_annotations_images = annotation_image_metadata
|
|
79
|
+
test_annotations_images = []
|
|
80
|
+
elif n_samples < 5: # Minimum 5 samples to ensure at least 1 in test
|
|
81
|
+
num_test_samples = max(1, int(n_samples * test_size))
|
|
82
|
+
train_annotations_images = annotation_image_metadata[:-num_test_samples]
|
|
83
|
+
test_annotations_images = annotation_image_metadata[-num_test_samples:]
|
|
84
|
+
else:
|
|
85
|
+
train_annotations_images, test_annotations_images = train_test_split(
|
|
86
|
+
annotation_image_metadata, test_size=test_size, random_state=0
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
train_annotation_image_metadata_by_category[category_id] = train_annotations_images
|
|
90
|
+
test_annotation_image_metadata_by_category[category_id] = test_annotations_images
|
|
91
|
+
train_sample_count += len(train_annotations_images)
|
|
92
|
+
|
|
93
|
+
# Create new train2019.json
|
|
94
|
+
new_train_metadata = (
|
|
95
|
+
old_train_metadata.copy()
|
|
96
|
+
) # Keep 'info', 'categories', 'licenses' unchanged
|
|
97
|
+
new_train_metadata.update(
|
|
98
|
+
{
|
|
99
|
+
"annotations": [],
|
|
100
|
+
"images": [],
|
|
101
|
+
}
|
|
102
|
+
)
|
|
103
|
+
for category_id, annotation_image_metadata in tqdm(
|
|
104
|
+
train_annotation_image_metadata_by_category.items(),
|
|
105
|
+
desc="Creating new train2019.json",
|
|
106
|
+
total=len(train_annotation_image_metadata_by_category),
|
|
107
|
+
):
|
|
108
|
+
for annotation_image in annotation_image_metadata:
|
|
109
|
+
new_annotation = annotation_image["annotation"].copy()
|
|
110
|
+
new_train_metadata["annotations"].append(new_annotation)
|
|
111
|
+
new_image = annotation_image["image"].copy()
|
|
112
|
+
new_train_metadata["images"].append(new_image)
|
|
113
|
+
|
|
114
|
+
with open(public / "train2019.json", "w") as f:
|
|
115
|
+
json.dump(new_train_metadata, f, indent=4, sort_keys=True)
|
|
116
|
+
|
|
117
|
+
# Copy over val2019.json
|
|
118
|
+
shutil.copy(raw / "val2019.json", public / "val2019.json")
|
|
119
|
+
logger.info(f"Copied {raw / 'val2019.json'} to {public / 'val2019.json'}")
|
|
120
|
+
|
|
121
|
+
# Create new test2019.json
|
|
122
|
+
new_to_old_file_name = {}
|
|
123
|
+
new_test_metadata = old_train_metadata.copy()
|
|
124
|
+
del new_test_metadata["categories"]
|
|
125
|
+
new_test_metadata.update(
|
|
126
|
+
{
|
|
127
|
+
"annotations": [],
|
|
128
|
+
"images": [],
|
|
129
|
+
}
|
|
130
|
+
)
|
|
131
|
+
# Flatten and shuffle test set so that we don't have all the same catedgories in a row
|
|
132
|
+
test_annotations_images = [
|
|
133
|
+
item for sublist in test_annotation_image_metadata_by_category.values() for item in sublist
|
|
134
|
+
]
|
|
135
|
+
random.Random(0).shuffle(test_annotations_images)
|
|
136
|
+
for idx, annotation_image in tqdm(
|
|
137
|
+
enumerate(test_annotations_images),
|
|
138
|
+
desc="Creating new test2019.json",
|
|
139
|
+
total=len(test_annotations_images),
|
|
140
|
+
):
|
|
141
|
+
|
|
142
|
+
new_annotation = annotation_image["annotation"].copy()
|
|
143
|
+
new_test_metadata["annotations"].append(new_annotation)
|
|
144
|
+
|
|
145
|
+
new_image = annotation_image["image"].copy()
|
|
146
|
+
old_file_name = new_image["file_name"]
|
|
147
|
+
# go from e.g. "train_val2019/Plants/400/d1322d13ccd856eb4236c8b888546c79.jpg" to "test2019/d1322d13ccd856eb4236c8b888546c79.jpg"
|
|
148
|
+
new_file_name = "test2019/" + old_file_name.split("/")[-1]
|
|
149
|
+
# keep track of things so we know what to copy later
|
|
150
|
+
new_to_old_file_name[new_file_name] = old_file_name
|
|
151
|
+
new_image["file_name"] = new_file_name
|
|
152
|
+
new_test_metadata["images"].append(new_image)
|
|
153
|
+
with open(public / "test2019.json", "w") as f:
|
|
154
|
+
# The public test data, of course, doesn't have annotations
|
|
155
|
+
public_new_test = new_test_metadata.copy()
|
|
156
|
+
del public_new_test["annotations"]
|
|
157
|
+
assert public_new_test.keys() == {
|
|
158
|
+
"images",
|
|
159
|
+
"info",
|
|
160
|
+
"licenses",
|
|
161
|
+
}, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
|
|
162
|
+
json.dump(public_new_test, f, indent=4, sort_keys=True)
|
|
163
|
+
|
|
164
|
+
(public / "train_val2019").mkdir(parents=True, exist_ok=True)
|
|
165
|
+
(public / "test2019").mkdir(parents=True, exist_ok=True)
|
|
166
|
+
|
|
167
|
+
# Save private test answers
|
|
168
|
+
answers_rows = []
|
|
169
|
+
for image_info, annotation_info in zip(
|
|
170
|
+
new_test_metadata["images"], new_test_metadata["annotations"]
|
|
171
|
+
):
|
|
172
|
+
assert (
|
|
173
|
+
image_info["id"] == annotation_info["image_id"]
|
|
174
|
+
), f"Mismatching image_id in image and annotation: {image_info['id']} vs {annotation_info['image_id']}"
|
|
175
|
+
answers_rows.append(
|
|
176
|
+
{
|
|
177
|
+
"id": image_info["id"],
|
|
178
|
+
"predicted": annotation_info["category_id"],
|
|
179
|
+
}
|
|
180
|
+
)
|
|
181
|
+
answers_df = pd.DataFrame(answers_rows)
|
|
182
|
+
answers_df.to_csv(private / "answers.csv", index=False)
|
|
183
|
+
|
|
184
|
+
# Create new sample submission based on answers_df
|
|
185
|
+
sample_df = answers_df.copy()
|
|
186
|
+
sample_df["predicted"] = [random.Random(42).randint(0, 1009) for _ in range(len(sample_df))]
|
|
187
|
+
sample_df.to_csv(public / "kaggle_sample_submission.csv", index=False)
|
|
188
|
+
|
|
189
|
+
assert len(answers_df) == len(
|
|
190
|
+
new_test_metadata["images"]
|
|
191
|
+
), f"Expected {len(new_test_metadata['images'])} rows in answers, but found {len(answers_df)}"
|
|
192
|
+
assert len(sample_df) == len(
|
|
193
|
+
answers_df
|
|
194
|
+
), f"Expected {len(answers_df)} rows in sample submission, but found {len(sample_df)}"
|
|
195
|
+
assert answers_df["id"].equals(
|
|
196
|
+
sample_df["id"]
|
|
197
|
+
), "Mismatched 'id' columns between answers and sample submission"
|
|
198
|
+
|
|
199
|
+
# Copy train images
|
|
200
|
+
train_images_copied = 0
|
|
201
|
+
for category_id, annotation_image_metadata in tqdm(
|
|
202
|
+
train_annotation_image_metadata_by_category.items(),
|
|
203
|
+
desc="Copying train images grouped by category",
|
|
204
|
+
):
|
|
205
|
+
for idx, annotation_image in enumerate(annotation_image_metadata):
|
|
206
|
+
if dev_mode and idx >= image_count:
|
|
207
|
+
break
|
|
208
|
+
old_path = raw / annotation_image["image"]["file_name"]
|
|
209
|
+
new_path = public / annotation_image["image"]["file_name"]
|
|
210
|
+
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
211
|
+
shutil.copy(old_path, new_path)
|
|
212
|
+
train_images_copied += 1
|
|
213
|
+
|
|
214
|
+
# Copy test images
|
|
215
|
+
test_images_copied = 0
|
|
216
|
+
for image_info in tqdm(new_test_metadata["images"], desc="Copying test images"):
|
|
217
|
+
if dev_mode and test_images_copied >= image_count:
|
|
218
|
+
break
|
|
219
|
+
old_path = raw / new_to_old_file_name[image_info["file_name"]]
|
|
220
|
+
new_path = public / image_info["file_name"]
|
|
221
|
+
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
222
|
+
shutil.copy(old_path, new_path)
|
|
223
|
+
test_images_copied += 1
|
|
224
|
+
|
|
225
|
+
logger.info(f"Copied {train_images_copied} train images and {test_images_copied} test images")
|
|
226
|
+
|
|
227
|
+
if not dev_mode:
|
|
228
|
+
assert len(list((public / "train_val2019").glob("**/*.jpg"))) == len(
|
|
229
|
+
new_train_metadata["images"]
|
|
230
|
+
), f"Mismatching number of images in train_images, got {len(list((public / 'train_val2019').glob('**/*.jpg')))}"
|
|
231
|
+
|
|
232
|
+
assert len(new_train_metadata["annotations"]) == len(
|
|
233
|
+
new_train_metadata["images"]
|
|
234
|
+
), f"Mismatching number of annotations in train_metadata, got {len(new_train_metadata['annotations'])}"
|
|
235
|
+
|
|
236
|
+
if not dev_mode:
|
|
237
|
+
assert len(list((public / "test2019").glob("**/*.jpg"))) == len(
|
|
238
|
+
new_test_metadata["images"]
|
|
239
|
+
), f"Mismatching number of images in test_images, got {len(list((public / 'test2019').glob('**/*.jpg')))}"
|
|
240
|
+
|
|
241
|
+
assert len(new_test_metadata["annotations"]) == len(
|
|
242
|
+
new_test_metadata["images"]
|
|
243
|
+
), f"Mismatching number of annotations in test_metadata, got {len(new_test_metadata['annotations'])}"
|
|
244
|
+
|
|
245
|
+
assert len(new_train_metadata["annotations"]) + len(
|
|
246
|
+
new_test_metadata["annotations"]
|
|
247
|
+
) == len(old_train_metadata["annotations"]), (
|
|
248
|
+
f"Expected {len(old_train_metadata['annotations'])} annotations in total, but found"
|
|
249
|
+
f"{len(new_train_metadata['annotations'])} in train and {len(new_test_metadata['annotations'])} in test"
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
# Re-compress images
|
|
253
|
+
add_to_tar(
|
|
254
|
+
src=public / "test2019", out=public / "test2019.tar.gz"
|
|
255
|
+
) # Add to tar but don't actually compress with gzip to save time
|
|
256
|
+
add_to_tar(src=public / "train_val2019", out=public / "train_val2019.tar.gz")
|
|
257
|
+
# Remove uncompressed directories
|
|
258
|
+
shutil.rmtree(public / "test2019")
|
|
259
|
+
shutil.rmtree(public / "train_val2019")
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import random
|
|
3
|
+
import shutil
|
|
4
|
+
import tarfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from sklearn.model_selection import train_test_split
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
|
|
12
|
+
from mlebench.utils import get_logger
|
|
13
|
+
|
|
14
|
+
logger = get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def add_to_tar(src: Path, out: Path):
    """Archive every file under `src` into a tar file at `out`.

    Members are stored with paths relative to `src`. The archive is written
    without gzip compression even when `out` ends in `.tar.gz` — callers do
    this deliberately to save time.
    """
    assert src.exists(), f"Source directory `{src}` does not exist."
    assert src.is_dir(), f"Expected a directory, but got `{src}`."

    members = [p for p in src.rglob("*") if p.is_file()]
    progress = tqdm(
        members,
        desc=f"Taring {src.name} to {out.name}",
        unit="file",
        total=len(members),
    )

    with tarfile.open(out, "w") as archive:
        for member in progress:
            archive.add(member, arcname=member.relative_to(src))
|
|
31
|
+
def _split_by_category(
    data_by_category: Dict, test_size: float, random_state: int
) -> tuple[Dict, Dict]:
    """Splits data for each category into train and test sets using the original script's logic."""
    train_by_category = {}
    test_by_category = {}

    progress = tqdm(
        data_by_category.items(),
        desc=f"Assigning train/test splits (test_size={test_size:.3f})",
    )
    for category_id, samples in progress:
        count = len(samples)
        if count == 1:
            # A lone sample always goes to train so the category stays learnable.
            train_part, test_part = samples, []
        elif count < 5:
            # Too few samples for sklearn's splitter to guarantee a test item;
            # slice at least one sample off the end for the test side instead.
            n_test = max(1, int(count * test_size))
            train_part, test_part = samples[:-n_test], samples[-n_test:]
        else:
            train_part, test_part = train_test_split(
                samples, test_size=test_size, random_state=random_state
            )

        train_by_category[category_id] = train_part
        test_by_category[category_id] = test_part

    return train_by_category, test_by_category
|
|
62
|
+
def _generate_split_files(
    train_annotation_image_metadata_by_category: Dict,
    test_annotation_image_metadata_by_category: Dict,
    old_train_metadata: Dict,
    raw_path: Path,
    public_path: Path,
    private_path: Path,
    dev_mode: bool,
    image_count: int,
):
    """
    Processes a given train/test split and saves all corresponding files
    (metadata, images, private answers, etc.) to the specified output directories.

    Args:
        train_annotation_image_metadata_by_category: category_id -> list of
            {"annotation": ..., "image": ...} dicts assigned to train.
        test_annotation_image_metadata_by_category: same structure, for test.
        old_train_metadata: the original train2019.json contents; its 'info',
            'categories' and 'licenses' entries are carried over unchanged.
        raw_path: directory holding the raw competition download.
        public_path: output directory for agent-visible files.
        private_path: output directory for grading-only files.
        dev_mode: when True, only copy up to `image_count` images per category
            (train) / in total (test), to speed up local runs.
        image_count: image cap used in dev mode.
    """
    public_path.mkdir(parents=True, exist_ok=True)
    private_path.mkdir(parents=True, exist_ok=True)

    # Create new train2019.json
    new_train_metadata = (
        old_train_metadata.copy()
    )  # Keep 'info', 'categories', 'licenses' unchanged
    new_train_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    for annotation_image_metadata in tqdm(
        train_annotation_image_metadata_by_category.values(),
        desc=f"[{public_path.name}] Creating new train2019.json",
        total=len(train_annotation_image_metadata_by_category),
    ):
        for annotation_image in annotation_image_metadata:
            new_train_metadata["annotations"].append(annotation_image["annotation"].copy())
            new_train_metadata["images"].append(annotation_image["image"].copy())

    with open(public_path / "train2019.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    # Copy over val2019.json
    shutil.copy(raw_path / "val2019.json", public_path / "val2019.json")

    # Create new test2019.json. The test set has no 'categories' (labels are
    # private), and image paths are rewritten to a flat test2019/ directory.
    new_to_old_file_name = {}
    new_test_metadata = old_train_metadata.copy()
    del new_test_metadata["categories"]
    new_test_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    # Flatten and shuffle test set so that we don't have all the same categories in a row
    test_annotations_images = [
        item
        for sublist in test_annotation_image_metadata_by_category.values()
        for item in sublist
    ]
    random.Random(0).shuffle(test_annotations_images)
    for annotation_image in tqdm(
        test_annotations_images,
        desc=f"[{public_path.name}] Creating new test2019.json",
        total=len(test_annotations_images),
    ):
        new_test_metadata["annotations"].append(annotation_image["annotation"].copy())

        new_image = annotation_image["image"].copy()
        old_file_name = new_image["file_name"]
        # go from e.g. "train_val2019/Plants/400/d1322d13ccd856eb4236c8b888546c79.jpg" to "test2019/d1322d13ccd856eb4236c8b888546c79.jpg"
        new_file_name = "test2019/" + old_file_name.split("/")[-1]
        # keep track of things so we know what to copy later
        new_to_old_file_name[new_file_name] = old_file_name
        new_image["file_name"] = new_file_name
        new_test_metadata["images"].append(new_image)

    with open(public_path / "test2019.json", "w") as f:
        # The public test data, of course, doesn't have annotations
        public_new_test = new_test_metadata.copy()
        del public_new_test["annotations"]
        assert public_new_test.keys() == {
            "images",
            "info",
            "licenses",
        }, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
        json.dump(public_new_test, f, indent=4, sort_keys=True)

    (public_path / "train_val2019").mkdir(parents=True, exist_ok=True)
    (public_path / "test2019").mkdir(parents=True, exist_ok=True)

    # Save private test answers. images/annotations were appended in lockstep
    # above, so zipping them pairs each image with its own annotation.
    answers_rows = []
    for image_info, annotation_info in zip(
        new_test_metadata["images"], new_test_metadata["annotations"]
    ):
        assert (
            image_info["id"] == annotation_info["image_id"]
        ), f"Mismatching image_id in image and annotation: {image_info['id']} vs {annotation_info['image_id']}"
        answers_rows.append(
            {
                "id": image_info["id"],
                "predicted": annotation_info["category_id"],
            }
        )
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private_path / "answers.csv", index=False)

    # Create new sample submission based on answers_df.
    # Bug fix: use ONE seeded RNG. Constructing random.Random(42) inside the
    # comprehension re-seeded per element, making every prediction identical.
    sample_df = answers_df.copy()
    rng = random.Random(42)
    sample_df["predicted"] = [rng.randint(0, 1009) for _ in range(len(sample_df))]
    sample_df.to_csv(public_path / "kaggle_sample_submission.csv", index=False)

    # Copy train images
    for annotation_image_metadata in tqdm(
        train_annotation_image_metadata_by_category.values(),
        desc=f"[{public_path.name}] Copying train images",
    ):
        for idx, annotation_image in enumerate(annotation_image_metadata):
            if dev_mode and idx >= image_count:
                break
            old_path = raw_path / annotation_image["image"]["file_name"]
            new_path = public_path / annotation_image["image"]["file_name"]
            new_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(old_path, new_path)

    # Copy test images
    # Bug fix: the previous dev-mode check compared image_count against
    # len(new_to_old_file_name), which is already fully populated here, so dev
    # mode copied zero test images. Count actual copies instead.
    test_images_copied = 0
    for image_info in tqdm(
        new_test_metadata["images"], desc=f"[{public_path.name}] Copying test images"
    ):
        if dev_mode and test_images_copied >= image_count:
            break
        old_path = raw_path / new_to_old_file_name[image_info["file_name"]]
        new_path = public_path / image_info["file_name"]
        new_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(old_path, new_path)
        test_images_copied += 1

    # Re-compress images
    add_to_tar(
        src=public_path / "test2019", out=public_path / "test2019.tar.gz"
    )  # Add to tar but don't actually compress with gzip to save time
    add_to_tar(src=public_path / "train_val2019", out=public_path / "train_val2019.tar.gz")
    # Remove uncompressed directories
    shutil.rmtree(public_path / "test2019")
    shutil.rmtree(public_path / "train_val2019")
    logger.info(f"Finished generating files for {public_path.name}")
|
|
211
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Prepare the iNaturalist-2019 competition data with an extra validation split.

    Reads the raw download in `raw`, then generates two train/test splits:
    1. the original-style split, written to `public` / `private`;
    2. a further train/validation split of the first split's train set, written
       to sibling directories `public_val` / `private_val`.
    """
    dev_mode = False
    image_count = 2 if dev_mode else float("inf")  # We copy over 2 images per category for dev mode

    # Extract train_val2019.tar.gz which contains images (skip if already done)
    train_tar_path = raw / "train_val2019.tar.gz"
    if not (raw / "train_val2019").exists():
        logger.info("Extracting raw image data...")
        shutil.unpack_archive(train_tar_path, raw)

    # Create train, test from train split
    json_path = raw / "train2019.json"
    with open(json_path, "r", encoding="utf-8") as f:
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later.
    annotation_image_metadata_by_category = {}  # We'll collect both `annotations` and `images` here
    # annotations and images are parallel lists in the raw metadata; iterate
    # the zip directly (no need to materialize it as a list first).
    for annotation_info, image_info in zip(
        old_train_metadata["annotations"], old_train_metadata["images"]
    ):
        assert (
            annotation_info["image_id"] == image_info["id"]
        ), f"Mismatching image_id in annotation and image: {annotation_info['image_id']} vs {image_info['id']}"
        category_id = annotation_info["category_id"]
        annotation_image_metadata_by_category.setdefault(category_id, []).append(
            {
                "annotation": annotation_info,
                "image": image_info,
            }
        )

    # --- 1. Original Data Split (Train/Test) ---
    logger.info("--- Generating Original Train/Test Split ---")
    # Original train+val has 268,243 images, test has 35,400 images, ~0.12 ratio
    original_test_size = 0.12
    original_train_split, original_test_split = _split_by_category(
        annotation_image_metadata_by_category,
        test_size=original_test_size,
        random_state=0,
    )

    _generate_split_files(
        original_train_split,
        original_test_split,
        old_train_metadata,
        raw,
        public,
        private,
        dev_mode,
        image_count,
    )
    logger.info(f"Original split saved to {public.name} and {private.name}")

    # --- 2. New Validation Data Split (Train/Val) ---
    logger.info("--- Generating New Train/Validation Split ---")
    # Define new output directories
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # We want the new validation set ('test_val') to have the same size as the
    # original test set. We are splitting the *original_train_split* to get it:
    #   new_test_size * (N * (1 - 0.12)) ≈ N * 0.12
    #   => new_test_size ≈ 0.12 / (1 - 0.12)
    val_split_test_size = original_test_size / (1.0 - original_test_size)

    validation_train_split, validation_test_split = _split_by_category(
        original_train_split,  # Split the TRAIN set from the first split
        test_size=val_split_test_size,
        random_state=0,  # Use same random state for consistency
    )

    _generate_split_files(
        validation_train_split,
        validation_test_split,
        old_train_metadata,
        raw,
        public_val,
        private_val,
        dev_mode,
        image_count,
    )
    logger.info(f"Validation split saved to {public_val.name} and {private_val.name}")
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.metrics import roc_auc_score
|
|
4
|
+
|
|
5
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Validate the submission against the answers and return aligned arrays.

    Returns a `(y_true, y_pred)` pair of numpy arrays, both ordered by `id`.
    Raises InvalidSubmissionError on any structural problem with the submission.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    for required in ("id", "target"):
        if required not in submission.columns:
            raise InvalidSubmissionError(f"Submission must have a '{required}' column")

    # Align both frames on id order before comparing row-wise.
    sub_sorted = submission.sort_values("id").reset_index(drop=True)
    ans_sorted = answers.sort_values("id").reset_index(drop=True)

    if not (sub_sorted["id"] == ans_sorted["id"]).all():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    # Extract predictions and ground-truth labels.
    y_true = ans_sorted["target"].values
    y_pred = sub_sorted["target"].values

    # Predictions must be numeric probabilities in [0, 1], with no NaNs.
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.isnan(y_pred).any():
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    if ((y_pred < 0) | (y_pred > 1)).any():
        raise InvalidSubmissionError("Predictions must be probabilities between 0 and 1")

    return y_true, y_pred
|
48
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Grade a submission by computing ROC AUC against the ground-truth answers."""
    labels, scores = prepare_for_metric(submission, answers)
    return roc_auc_score(labels, scores)
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare instant-gratification dataset.

    Copies train/test/sample-submission from `raw` into `public` (visible to
    agents) and the test answers into `private` (used only for grading).
    """
    # Read data
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Validate BEFORE writing anything, so a malformed raw dump does not
    # leave behind a partial/inconsistent output tree.
    assert len(test_answer) == len(sample_submission), \
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"

    # Ensure the output directories exist (other prepare scripts in this
    # package create them; read-only raw dirs are never touched).
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    # Public files (visible to agents)
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)
|
|
File without changes
|