dslighting 1.7.1-py3-none-any.whl → 1.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py
@@ -0,0 +1,186 @@

import shutil
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

from mlebench.competitions.utils import get_ids_from_tf_records
from mlebench.utils import read_csv


def _process_and_save_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    train_tfrecords: list,
    test_tfrecords: list,
    public_dir: Path,
    private_dir: Path,
    raw_dir: Path,
):
    """
    Helper function to process a single data split (train/test) and save all
    associated files to the specified public and private directories.
    """
    # Create output directories
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # --- Create and save CSV files ---
    sample_submission = test_df.copy()
    sample_submission["label"] = 4  # Default label for submission template

    train_df.to_csv(public_dir / "train.csv", index=False)
    test_df.to_csv(private_dir / "test.csv", index=False)  # Ground truth
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # --- Copy and rename TFRecord files ---
    (public_dir / "train_tfrecords").mkdir(exist_ok=True)
    for i, path in tqdm(
        enumerate(train_tfrecords),
        desc=f"Copying Train TFRecords to {public_dir.name}",
        total=len(train_tfrecords),
    ):
        length = path.stem.split("-")[1]
        new_name = f"ld_train{i:02d}-{length}.tfrec"
        shutil.copy(path, public_dir / "train_tfrecords" / new_name)

    (public_dir / "test_tfrecords").mkdir(exist_ok=True)
    for i, path in tqdm(
        enumerate(test_tfrecords),
        desc=f"Copying Test TFRecords to {public_dir.name}",
        total=len(test_tfrecords),
    ):
        length = path.stem.split("-")[1]
        new_name = f"ld_test{i:02d}-{length}.tfrec"
        shutil.copy(path, public_dir / "test_tfrecords" / new_name)

    # --- Copy image files ---
    (public_dir / "train_images").mkdir(exist_ok=True)
    for image_id in tqdm(
        train_df["image_id"],
        desc=f"Copying Train Images to {public_dir.name}",
        total=len(train_df),
    ):
        shutil.copy(raw_dir / "train_images" / image_id, public_dir / "train_images")

    (public_dir / "test_images").mkdir(exist_ok=True)
    for image_id in tqdm(
        test_df["image_id"],
        desc=f"Copying Test Images to {public_dir.name}",
        total=len(test_df),
    ):
        shutil.copy(raw_dir / "train_images" / image_id, public_dir / "test_images")

    # --- Copy auxiliary files ---
    shutil.copy(raw_dir / "label_num_to_disease_map.json", public_dir)

    # --- Perform checks for this split ---
    assert len(train_df) + len(test_df) == len(train_df) + len(
        test_df
    ), f"Length check failed for {public_dir.name}"
    assert len(sample_submission) == len(
        test_df
    ), f"Sample submission length mismatch for {public_dir.name}"

    assert len(train_df) == sum(
        1 for _ in (public_dir / "train_images").iterdir()
    ), f"Train image count mismatch in {public_dir.name}"
    assert len(test_df) == sum(
        1 for _ in (public_dir / "test_images").iterdir()
    ), f"Test image count mismatch in {public_dir.name}"

    assert len(train_tfrecords) == sum(
        1 for _ in (public_dir / "train_tfrecords").iterdir()
    ), f"Train TFRecord count mismatch in {public_dir.name}"
    assert len(test_tfrecords) == sum(
        1 for _ in (public_dir / "test_tfrecords").iterdir()
    ), f"Test TFRecord count mismatch in {public_dir.name}"

    assert train_df.columns.tolist() == [
        "image_id",
        "label",
    ], f"Train columns mismatch for {public_dir.name}"
    assert test_df.columns.tolist() == [
        "image_id",
        "label",
    ], f"Test columns mismatch for {public_dir.name}"
    assert sample_submission.columns.tolist() == [
        "image_id",
        "label",
    ], f"Sample submission columns mismatch for {public_dir.name}"

    assert set(train_df["image_id"]).isdisjoint(
        test_df["image_id"]
    ), f"Train and test image IDs are not disjoint for {public_dir.name}"


def prepare(raw: Path, public: Path, private: Path):
    # Define paths for the new validation split
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # need to split based on the TFRecord files, since not mentioned in the CSVs
    tfrecord_files = [
        path
        for path in sorted((raw / "train_tfrecords").iterdir())
        if path.is_file() and path.suffix == ".tfrec"
    ]

    # --- FIRST SPLIT: Create original train and test sets ---
    # In the original there are 21397 train samples and they say test has ~15000 test samples, which is ~ 0.4/0.6 test/train split
    # We use 0.1 ratio to avoid removing too many samples from train
    train_tfrecords, test_tfrecords = train_test_split(
        tfrecord_files, test_size=0.1, random_state=0
    )

    # parse the IDs from the test tf records
    test_ids = []
    for path in test_tfrecords:
        test_ids.extend(get_ids_from_tf_records(path))

    # Create dataframes for the first split
    full_train_df = read_csv(raw / "train.csv")
    train_df = full_train_df[~full_train_df["image_id"].isin(test_ids)].copy()
    test_df = full_train_df[full_train_df["image_id"].isin(test_ids)].copy()

    # Process and save the original split to 'public' and 'private'
    # This ensures the original outputs are untouched
    _process_and_save_split(
        train_df=train_df,
        test_df=test_df,
        train_tfrecords=train_tfrecords,
        test_tfrecords=test_tfrecords,
        public_dir=public,
        private_dir=private,
        raw_dir=raw,
    )

    # --- SECOND SPLIT: Create new training and validation sets from the original train set ---
    # The new training set from the first split becomes the source for this second split.
    # A test_size of 1/9 on the train_tfrecords (which is 90% of the original data)
    # results in a validation set that is 10% of the original total, matching the
    # original test set size. (1/9 * 0.9 = 0.1)
    train_val_tfrecords, test_val_tfrecords = train_test_split(
        train_tfrecords, test_size=1 / 9, random_state=0
    )

    # Parse IDs for the validation set
    test_val_ids = []
    for path in test_val_tfrecords:
        test_val_ids.extend(get_ids_from_tf_records(path))

    # Create dataframes for the validation split using the original train_df
    train_val_df = train_df[~train_df["image_id"].isin(test_val_ids)].copy()
    test_val_df = train_df[train_df["image_id"].isin(test_val_ids)].copy()

    # Process and save the validation split to 'public_val' and 'private_val'
    _process_and_save_split(
        train_df=train_val_df,
        test_df=test_val_df,
        train_tfrecords=train_val_tfrecords,
        test_tfrecords=test_val_tfrecords,
        public_dir=public_val,
        private_dir=private_val,
        raw_dir=raw,
    )
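
The prepare script above is driven by three directories. A minimal sketch of how it might be invoked, assuming the prepare function above is in scope; the paths are illustrative only and are not defined by this diff:

from pathlib import Path

# Illustrative paths; the real layout is decided by the mlebench CLI/registry.
raw = Path("cassava/raw")          # original competition download (train.csv, train_tfrecords/, ...)
public = Path("cassava/public")    # files exposed to the solver
private = Path("cassava/private")  # held-out ground truth used for grading

prepare(raw, public, private)
# Besides public/ and private/, this also writes sibling public_val/ and
# private_val/ directories for the extra validation split.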
mlebench/competitions/cdiscount-image-classification-challenge/grade.py
@@ -0,0 +1,11 @@

import pandas as pd
from sklearn.metrics import accuracy_score

from mlebench.competitions.utils import prepare_for_accuracy_metric


def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    accuracy_inputs = prepare_for_accuracy_metric(
        submission=submission, answers=answers, target_column="category_id", id_column="_id"
    )
    return accuracy_score(**accuracy_inputs)
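
A toy call of the grade function above, for illustration. The behaviour of prepare_for_accuracy_metric is assumed from its name (align the two frames on `_id` and hand y_true/y_pred to accuracy_score); its actual implementation lives in mlebench/competitions/utils.py and is not shown in this diff:

import pandas as pd

# Hypothetical data: three products, two of them classified correctly.
answers = pd.DataFrame({"_id": [0, 1, 2], "category_id": [1, 2, 3]})
submission = pd.DataFrame({"_id": [0, 1, 2], "category_id": [1, 2, 9]})

print(grade(submission, answers))  # expected 2/3 under the assumption above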
mlebench/competitions/cdiscount-image-classification-challenge/prepare.py
@@ -0,0 +1,144 @@

import shutil
from itertools import islice
from pathlib import Path

import bson
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """

    dev_mode = False

    def read_ids_and_category_ids(file_path: Path) -> pd.DataFrame:
        data = bson.decode_file_iter(open(file_path, "rb"))

        records = []

        for c, d in enumerate(tqdm(data, desc="Reading BSON data")):
            records.append({"_id": d["_id"], "category_id": d["category_id"]})

        return pd.DataFrame(records)

    def filter_bson_by_ids(
        bson_file_path: Path,
        ids: set,
        write_path: Path,
        exclude_cols: list = [],
        chunk_size=1000,
        max_rows=None,
    ):
        """
        Filters a BSON file by a set of IDs and writes the filtered data to a new BSON file.
        The original _id is replaced with a new _id starting from 0 and incrementing by 1.

        Args:
            bson_file_path (Path): Path to the input BSON file.
            ids (set): Set of IDs to filter by.
            write_path (Path): Path to the output BSON file.
            exclude_cols (list): List of columns to exclude from the output.
            max_rows (int, optional): Maximum number of rows to write to the output file.
        """
        data = bson.decode_file_iter(open(bson_file_path, "rb"))
        num_written_rows = 0

        with open(write_path, "wb") as f:
            for record in tqdm(data, desc="Filtering BSON data"):
                if record["_id"] in ids:
                    for col in exclude_cols:
                        if col in record:
                            del record[col]
                    num_written_rows += 1
                    f.write(bson.BSON.encode(record))

                    if num_written_rows % chunk_size == 0:
                        f.flush()

                    if max_rows is not None and num_written_rows >= max_rows:
                        break

    # Create train, test from train split. Original train.bson contains 7,069,896 rows. Original test.bson contains 1,768,182 rows.
    old_train = read_ids_and_category_ids(raw / "train.bson")

    # Ensure rows in train_example remain in new_train
    new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
    answers = answers.sort_values(by="_id")

    # Create sample submission
    sample_submission = answers[["_id"]]
    sample_submission["category_id"] = 1000010653

    # Checks
    assert len(new_train) + len(answers) == len(
        old_train
    ), f"The length of new_train and answers combined should be equal to the original length of old_train. Got {len(new_train) + len(answers)} and {len(old_train)}"
    assert set(new_train["_id"]).isdisjoint(
        set(answers["_id"])
    ), "new_train and answers should not have any _ids in common"
    assert sample_submission.columns.tolist() == [
        "_id",
        "category_id",
    ], f"sample_submission should have columns _id and category_id. Got {sample_submission.columns.tolist()}"

    # Write new files
    answers.to_csv(private / "answers.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    filter_bson_by_ids(
        bson_file_path=(
            raw / "train_example.bson" if dev_mode else raw / "train.bson"
        ),  # train_example.bson is the first 100 rows of train.bson
        ids=set(new_train["_id"]),
        write_path=public / "train.bson",
    )
    filter_bson_by_ids(
        bson_file_path=raw / "train_example.bson" if dev_mode else raw / "train.bson",
        ids=set(answers["_id"]),
        write_path=public / "test.bson",
        exclude_cols=["category_id"],  # removes category_id for test.bson
    )

    # Write new train_example.bson which is the first 100 rows of the new train.bson
    filter_bson_by_ids(
        bson_file_path=(
            raw / "train_example.bson" if dev_mode else raw / "train.bson"
        ),  # train_example.bson is the first 100 rows of train.bson
        ids=set(new_train["_id"]),
        write_path=public / "train_example.bson",
        max_rows=100,
    )

    def is_valid_bson_file(file_path: Path, chunk_size: int = 10000):
        try:
            with open(file_path, "rb") as f:
                data_iter = bson.decode_file_iter(f)
                for chunk in tqdm(
                    iter(lambda: list(islice(data_iter, chunk_size)), []),
                    desc=f"Validating {file_path.name}",
                ):
                    pd.DataFrame(chunk)  # Attempt to create a DataFrame from the chunk
        except Exception as e:
            return False

        return True

    # Check train.bson
    assert is_valid_bson_file(public / "train.bson"), f"Couldn't parse `train.bson` as a bson file!"

    # Check test.bson
    assert is_valid_bson_file(public / "test.bson"), f"Couldn't parse `test.bson` as a bson file!"

    # Copy over other files
    shutil.copy(raw / "category_names.csv", public / "category_names.csv")

    actual_new_train = read_ids_and_category_ids(public / "train.bson")
    actual_new_train_example = read_ids_and_category_ids(public / "train_example.bson")

    assert actual_new_train.iloc[:100].equals(
        actual_new_train_example
    ), f"The first 100 rows of `train.bson` should be the same as `train_example.bson`"
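
The heavy lifting in the prepare script above is filter_bson_by_ids, which streams records out of one BSON file and re-encodes only the kept ones into another, so the multi-million-row train.bson never has to fit in memory. A minimal self-contained sketch of that read/write pattern using the same pymongo bson API; the file names here are illustrative:

import bson

# Write a tiny BSON file of toy records.
with open("toy.bson", "wb") as f:
    for i in range(5):
        f.write(bson.BSON.encode({"_id": i, "category_id": 1000010653}))

# Stream it back, keeping only selected _ids, as filter_bson_by_ids does.
keep = {1, 3}
with open("toy.bson", "rb") as src, open("toy_filtered.bson", "wb") as dst:
    for record in bson.decode_file_iter(src):
        if record["_id"] in keep:
            dst.write(bson.BSON.encode(record))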
mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py
@@ -0,0 +1,205 @@

import shutil
from itertools import islice
from pathlib import Path

import bson
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in public_val/private_val directories.
    """

    dev_mode = False

    def read_ids_and_category_ids(file_path: Path) -> pd.DataFrame:
        data = bson.decode_file_iter(open(file_path, "rb"))

        records = []

        for c, d in enumerate(tqdm(data, desc="Reading BSON data")):
            records.append({"_id": d["_id"], "category_id": d["category_id"]})

        return pd.DataFrame(records)

    def filter_bson_by_ids(
        bson_file_path: Path,
        ids: set,
        write_path: Path,
        exclude_cols: list = [],
        chunk_size=1000,
        max_rows=None,
    ):
        """
        Filters a BSON file by a set of IDs and writes the filtered data to a new BSON file.
        The original _id is replaced with a new _id starting from 0 and incrementing by 1.

        Args:
            bson_file_path (Path): Path to the input BSON file.
            ids (set): Set of IDs to filter by.
            write_path (Path): Path to the output BSON file.
            exclude_cols (list): List of columns to exclude from the output.
            max_rows (int, optional): Maximum number of rows to write to the output file.
        """
        data = bson.decode_file_iter(open(bson_file_path, "rb"))
        num_written_rows = 0

        with open(write_path, "wb") as f:
            for record in tqdm(data, desc=f"Filtering BSON data for {write_path.name}"):
                if record["_id"] in ids:
                    for col in exclude_cols:
                        if col in record:
                            del record[col]
                    num_written_rows += 1
                    f.write(bson.BSON.encode(record))

                    if num_written_rows % chunk_size == 0:
                        f.flush()

                    if max_rows is not None and num_written_rows >= max_rows:
                        break

    def is_valid_bson_file(file_path: Path, chunk_size: int = 10000):
        try:
            with open(file_path, "rb") as f:
                data_iter = bson.decode_file_iter(f)
                for chunk in tqdm(
                    iter(lambda: list(islice(data_iter, chunk_size)), []),
                    desc=f"Validating {file_path.name}",
                ):
                    pd.DataFrame(chunk)  # Attempt to create a DataFrame from the chunk
        except Exception as e:
            print(f"BSON validation failed for {file_path}: {e}")
            return False

        return True

    def _process_and_write_split(
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        target_public_path: Path,
        target_private_path: Path,
    ):
        """
        Helper function to process a given train/test split and write all associated files
        to the specified public and private directories.
        """
        # Create output directories
        target_public_path.mkdir(exist_ok=True)
        target_private_path.mkdir(exist_ok=True)

        # Sort test dataframe for consistency
        answers = test_df.sort_values(by="_id")

        # Create sample submission
        sample_submission = answers[["_id"]]
        sample_submission["category_id"] = 1000010653

        # Basic integrity checks
        assert set(train_df["_id"]).isdisjoint(
            set(answers["_id"])
        ), "Train and test sets should not have any _ids in common"
        assert sample_submission.columns.tolist() == [
            "_id",
            "category_id",
        ], f"sample_submission should have columns _id and category_id. Got {sample_submission.columns.tolist()}"

        # Write new files
        answers.to_csv(target_private_path / "answers.csv", index=False)
        sample_submission.to_csv(target_public_path / "sample_submission.csv", index=False)

        # Determine raw data source based on dev_mode
        raw_bson_source = raw / "train_example.bson" if dev_mode else raw / "train.bson"

        filter_bson_by_ids(
            bson_file_path=raw_bson_source,
            ids=set(train_df["_id"]),
            write_path=target_public_path / "train.bson",
        )
        filter_bson_by_ids(
            bson_file_path=raw_bson_source,
            ids=set(answers["_id"]),
            write_path=target_public_path / "test.bson",
            exclude_cols=["category_id"],
        )
        filter_bson_by_ids(
            bson_file_path=raw_bson_source,
            ids=set(train_df["_id"]),
            write_path=target_public_path / "train_example.bson",
            max_rows=100,
        )

        # Validate generated BSON files
        assert is_valid_bson_file(target_public_path / "train.bson")
        assert is_valid_bson_file(target_public_path / "test.bson")

        # Copy over other files
        shutil.copy(raw / "category_names.csv", target_public_path / "category_names.csv")

        # Final check on train_example.bson content
        actual_new_train = read_ids_and_category_ids(target_public_path / "train.bson")
        actual_new_train_example = read_ids_and_category_ids(target_public_path / "train_example.bson")

        assert actual_new_train.iloc[:100].equals(
            actual_new_train_example
        ), f"The first 100 rows of `train.bson` should be the same as `train_example.bson` in {target_public_path}"

    # --- Main Script Logic ---

    # Read the complete dataset IDs and categories
    # Original train.bson contains 7,069,896 rows. Original test.bson contains 1,768,182 rows.
    old_train = read_ids_and_category_ids(raw / "train.bson")

    # === 1. Original Split: (train -> new_train + test) ===
    # This split creates the primary competition data in `public` and `private`.
    # This block is functionally identical to the original script to ensure outputs do not change.
    print("--- Processing Original Split (public/private) ---")
    new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)

    assert len(new_train) + len(answers) == len(
        old_train
    ), f"The length of new_train and answers combined should be equal to the original length of old_train. Got {len(new_train) + len(answers)} and {len(old_train)}"

    _process_and_write_split(
        train_df=new_train,
        test_df=answers,
        target_public_path=public,
        target_private_path=private,
    )
    print("--- Original Split processing complete. ---")

    # === 2. New Validation Split: (new_train -> train_val + test_val) ===
    # This second split takes the `new_train` set from above and splits it again.
    # The outputs are saved to new, parallel directories `public_val` and `private_val`.
    print("\n--- Processing Validation Split (public_val/private_val) ---")

    # Define paths for the new validation set
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # To make the new `test_val` set have a similar size to the original `test` set (10% of total),
    # we must take a fraction of `new_train`. Since `new_train` is 90% of the total,
    # we take 1/9 of it to get our new test set. (1/9) * 0.9 = 0.1
    test_val_size_fraction = 1 / 9.0

    train_val, answers_val = train_test_split(
        new_train, test_size=test_val_size_fraction, random_state=0
    )

    assert len(train_val) + len(answers_val) == len(
        new_train
    ), "The validation split did not partition the new_train set correctly."

    _process_and_write_split(
        train_df=train_val,
        test_df=answers_val,
        target_public_path=public_val,
        target_private_path=private_val,
    )
    print("--- Validation Split processing complete. ---")
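
The 1/9 fraction used for the validation split above can be sanity-checked on a toy index; this snippet is not part of the package and only verifies the arithmetic in the comments:

from sklearn.model_selection import train_test_split

ids = list(range(1000))
new_train, test = train_test_split(ids, test_size=0.1, random_state=0)              # 900 / 100
train_val, test_val = train_test_split(new_train, test_size=1 / 9, random_state=0)  # 800 / 100

# Both held-out sets are 10% of the original 1000 rows.
print(len(test), len(test_val))  # 100 100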
mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py
@@ -0,0 +1,67 @@

from pandas import DataFrame
from scipy.sparse import spmatrix
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

from mlebench.grade_helpers import InvalidSubmissionError


def binarize_sentences(y_true: list[set[str]], y_pred: list[set[str]]) -> tuple[spmatrix, spmatrix]:
    """
    Binarizes lists of sentence sets so we can pass them as inputs to scipy.jaccard_score
    """
    all_words = list(set.union(*y_true, *y_pred))

    mlb = MultiLabelBinarizer(classes=all_words, sparse_output=True)
    y_true_bin = mlb.fit_transform(y_true)
    y_pred_bin = mlb.transform(y_pred)

    return y_true_bin, y_pred_bin


def prepare_for_grading(
    submission: DataFrame, answers: DataFrame, target_column: str
) -> dict[str, spmatrix]:
    """
    Extracts the sentences from the dataframe
    Splits them into sets of lowercased words
    Binarizes these sets so we can pass them as inputs to scipy.jaccard_score
    """

    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission should be the same length as the answers")
    if target_column not in submission.columns:
        raise InvalidSubmissionError(f"Submission must have a `{target_column}` column")
    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an `id` column")
    assert target_column in answers.columns, f"Answers must have a `{target_column}` column"
    assert "id" in answers.columns, "Answers must have an `id` column"

    submission = submission.sort_values("id")
    answers = answers.sort_values("id")

    if (submission["id"].values != answers["id"].values).any():
        raise InvalidSubmissionError("Submission does not contain same IDs as test")

    # pandas reads empty cells as NaNs, which are float. We fill with empty string to match type
    submission[target_column] = submission[target_column].fillna("")
    answers[target_column] = answers[target_column].fillna("")

    y_pred = submission[target_column].tolist()
    y_true = answers[target_column].tolist()

    # split into sets of words
    y_pred_sentence_sets = [set(sentence.lower().split()) for sentence in y_pred]
    y_true_sentence_sets = [set(sentence.lower().split()) for sentence in y_true]

    # binarize for jaccard_score
    y_true_bin, y_pred_bin = binarize_sentences(y_true_sentence_sets, y_pred_sentence_sets)

    return {"y_pred": y_pred_bin, "y_true": y_true_bin}


def grade(submission: DataFrame, answers: DataFrame) -> float:
    grading_input = prepare_for_grading(submission, answers, "PredictionString")

    # we use average=`samples` to match what the competition description suggests
    return jaccard_score(**grading_input, average="samples")
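
A worked example of the word-level Jaccard metric implemented above, assuming the grade function is in scope; the sentences are made up. For "answer text here" vs "answer here", the word sets share {answer, here} out of a union of {answer, text, here}, giving 2/3 for that sample:

import pandas as pd

answers = pd.DataFrame({"id": ["a", "b"], "PredictionString": ["answer text here", "नहीं"]})
submission = pd.DataFrame({"id": ["a", "b"], "PredictionString": ["answer here", "नहीं"]})

print(grade(submission, answers))  # (2/3 + 1/1) / 2 ≈ 0.833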
mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py
@@ -0,0 +1,31 @@

from pathlib import Path

from sklearn.model_selection import train_test_split

from mlebench.utils import read_csv


def prepare(raw: Path, public: Path, private: Path):

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    new_test_without_labels = new_test.drop(columns=["answer_start", "answer_text"])

    # make private test match submission format
    new_test = new_test[["id", "answer_text"]]
    new_test.columns = ["id", "PredictionString"]

    # Copy over files
    new_train.to_csv(public / "train.csv", index=False)
    new_test_without_labels.to_csv(public / "test.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Create sample submission
    sample_submission = new_test.copy()
    sample_submission["PredictionString"] = "dummy text"
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    assert len(sample_submission) == len(
        new_test
    ), "Sample submission length does not match test length."