dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,89 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def _save_split(
    train_df: pd.DataFrame, test_df: pd.DataFrame, public_path: Path, private_path: Path
):
    """
    Save a train/test split to the given public and private directories.

    The public directory receives ``train.csv``, ``test.csv`` (with the
    ``fare_amount`` label column removed) and ``sample_submission.csv``; the
    private directory receives the labelled ``test.csv``.

    All integrity checks run *before* anything is created on disk, so a split
    that fails validation never leaves partial or invalid output behind.

    Args:
        train_df: The training dataframe (expected to have 8 columns).
        test_df: The testing dataframe, labels included (expected to have 8 columns).
        public_path: The path to the public output directory.
        private_path: The path to the private output directory.

    Raises:
        KeyError: If ``test_df`` lacks the ``fare_amount`` or ``key`` column, or
            ``train_df`` lacks the ``key`` column.
        AssertionError: If train and test share keys or any frame has an
            unexpected shape.
    """
    # Prepare dataframes for saving
    test_without_labels = test_df.drop(columns=["fare_amount"])
    # Select the single column first, then copy: avoids duplicating the whole frame.
    submission_df = test_df[["key"]].copy()
    submission_df["fare_amount"] = 11.35  # A sample constant value

    # Run checks to ensure data integrity (before touching the filesystem)
    assert set(train_df["key"]).isdisjoint(
        set(test_df["key"])
    ), "Train and test sets share samples!"
    assert test_df.shape[1] == 8, f"Test set should have 8 columns, but has {test_df.shape[1]}"
    assert (
        test_without_labels.shape[1] == 7
    ), f"Test set without labels should have 7 columns, but has {test_without_labels.shape[1]}"
    assert train_df.shape[1] == 8, f"Train set should have 8 columns, but has {train_df.shape[1]}"
    assert (
        submission_df.shape[1] == 2
    ), f"Sample submission should have 2 columns, but has {submission_df.shape[1]}"
    assert (
        submission_df.shape[0] == test_df.shape[0]
    ), f"Sample submission should have {test_df.shape[0]} rows, but has {submission_df.shape[0]}"

    # Ensure output directories exist
    public_path.mkdir(parents=True, exist_ok=True)
    private_path.mkdir(parents=True, exist_ok=True)

    # Write CSVs to public and private directories
    train_df.to_csv(public_path / "train.csv", index=False)
    test_without_labels.to_csv(public_path / "test.csv", index=False)
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)
    test_df.to_csv(private_path / "test.csv", index=False)
55
+
56
+
57
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the competition split and a second, nested validation split.

    Stage 1 splits ``raw/train.csv`` into train/test and writes the result to
    ``public`` and ``private``. Stage 2 splits the stage-1 training set again,
    with the same test size and seed, and writes the result to sibling
    directories named ``public_val`` and ``private_val``. The coupon
    instructions file is copied into both public directories.

    Args:
        raw: Directory holding the raw ``train.csv`` and the instructions file.
        public: Output directory for the public files of the main split.
        private: Output directory for the private files of the main split.
    """
    holdout_rows = 9914  # number of rows held out by each of the two splits
    instructions_name = "GCP-Coupons-Instructions.rtf"
    instructions_src = raw / instructions_name

    # Load the raw training data
    full_train = read_csv(raw / "train.csv")

    # --- Stage 1: the original train/test split for the main competition files ---
    # The outputs written to `public/` and `private/` are the primary sets.
    train_orig, test_orig = train_test_split(
        full_train, test_size=holdout_rows, random_state=0
    )
    _save_split(train_orig, test_orig, public, private)

    # The instructions file goes next to the public data; `_save_split` has
    # already created the directory.
    shutil.copy(instructions_src, public / instructions_name)

    # --- Stage 2: a train/validation split for local validation ---
    # `train_orig` is split once more into a smaller training set and a
    # validation set, using the same test size and seed as stage 1.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    train_val, test_val = train_test_split(
        train_orig, test_size=holdout_rows, random_state=0
    )
    _save_split(train_val, test_val, public_val, private_val)

    # Copy the same instructions file into the public_val directory
    shutil.copy(instructions_src, public_val / instructions_name)
@@ -0,0 +1,36 @@
1
+ from typing import Tuple
2
+
3
+ import pandas as pd
4
+ from sklearn import metrics
5
+
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+
9
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Validate a submission against the answers and align both by ``contact_id``.

    Args:
        submission: Candidate predictions; must contain ``contact_id`` and ``contact``.
        answers: Ground truth; must contain ``contact_id`` and ``contact``.

    Returns:
        A ``(submission, answers)`` tuple, each sorted by ``contact_id``.

    Raises:
        AssertionError: If ``answers`` lacks a required column.
        InvalidSubmissionError: If ``submission`` lacks a required column, its
            set of ``contact_id`` values differs from the answers', or the two
            frames have different lengths.
    """
    # The answers come from the benchmark itself, so a bad schema is a bug here.
    assert "contact_id" in answers.columns, "contact_id column not found in answers"
    assert "contact" in answers.columns, "contact column not found in answers"

    required_columns = {"contact_id", "contact"}
    if not required_columns <= set(submission.columns):
        raise InvalidSubmissionError("Submission must have columns: contact_id, contact")

    submitted_ids = set(submission["contact_id"])
    expected_ids = set(answers["contact_id"])
    if submitted_ids != expected_ids:
        raise InvalidSubmissionError(
            "Submission contact_id's inconsistent with answers contact_id's"
        )

    # Equal id sets do not rule out duplicated rows, so compare lengths as well.
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission and answers have different lengths: submission has {len(submission)} samples, answers has {len(answers)} samples"
        )

    # Sorting both frames on the id puts matching rows at matching positions.
    return (
        submission.sort_values(by="contact_id"),
        answers.sort_values(by="contact_id"),
    )
31
+
32
+
33
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Score a submission with the Matthews correlation coefficient.

    Both frames are first validated and sorted by ``prepare_for_metric``; the
    coefficient is then computed on their ``contact`` columns, with the
    answers as the true labels and the submission as the predictions.

    Args:
        submission: Candidate predictions.
        answers: Ground-truth labels.

    Returns:
        The value returned by ``metrics.matthews_corrcoef``.
    """
    aligned_submission, aligned_answers = prepare_for_metric(submission, answers)

    y_true = aligned_answers["contact"]
    y_pred = aligned_submission["contact"]
    return metrics.matthews_corrcoef(y_true, y_pred)
@@ -0,0 +1,101 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+
9
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the public/private competition layout from the raw training data.

    Game plays found in ``train_labels.csv`` are split 90/10 (``random_state=0``) so
    that train and test never share a game play. Every per-game-play table and video
    is then routed to the matching side of the split.

    Args:
        raw: Directory holding the raw ``train_*.csv`` files and a ``train/`` video folder.
        public: Destination for the train data, test inputs and the sample submission.
        private: Destination for the labelled test set (``test.csv``).
    """
    # parents=True so a fresh output location works, and create `private` as well,
    # because the labelled test set is written there below.
    (public / "train").mkdir(parents=True, exist_ok=True)
    (public / "test").mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    # Create train, test from train split. Ensure train, test come from different game plays
    old_train = pd.read_csv(raw / "train_labels.csv")
    unique_game_play = old_train["game_play"].unique()
    new_train_game_play, new_test_game_play = train_test_split(
        unique_game_play, test_size=0.1, random_state=0
    )

    new_train = old_train[old_train["game_play"].isin(new_train_game_play)]
    new_test = old_train[old_train["game_play"].isin(new_test_game_play)]
    assert set(new_train["contact_id"]).isdisjoint(
        set(new_test["contact_id"])
    ), "Train and test label share samples!"

    new_train.to_csv(public / "train_labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # The auxiliary tables are all keyed by game_play and split the same way.
    for table_name in ("baseline_helmets", "player_tracking", "video_metadata"):
        table = pd.read_csv(raw / f"train_{table_name}.csv")
        table[table["game_play"].isin(new_train_game_play)].to_csv(
            public / f"train_{table_name}.csv", index=False
        )
        table[table["game_play"].isin(new_test_game_play)].to_csv(
            public / f"test_{table_name}.csv", index=False
        )

    # Copy over videos. All of them come from raw/train, since the new test set is
    # carved out of the original training data.
    views = ("All29", "Endzone", "Sideline")
    plays_by_split = {
        "train": new_train["game_play"].unique(),
        "test": new_test["game_play"].unique(),
    }
    for game_play_type in views:
        for split_name, game_plays in plays_by_split.items():
            for game_play in game_plays:
                video_name = f"{game_play}_{game_play_type}.mp4"
                shutil.copyfile(
                    src=raw / "train" / video_name,
                    dst=public / split_name / video_name,
                )

    # Check integrity of the files copied: one video per view per game play
    num_train_videos_found = len(list(public.glob("train/*.mp4")))
    num_test_videos_found = len(list(public.glob("test/*.mp4")))
    num_expected_train_videos = len(plays_by_split["train"]) * len(views)
    num_expected_test_videos = len(plays_by_split["test"]) * len(views)

    assert (
        num_train_videos_found == num_expected_train_videos
    ), f"Expected {num_expected_train_videos} videos, found {num_train_videos_found}"
    assert (
        num_test_videos_found == num_expected_test_videos
    ), f"Expected {num_expected_test_videos} videos, found {num_test_videos_found}"

    # Create a sample submission file
    submission_df = pd.DataFrame(
        {
            "contact_id": new_test["contact_id"],
            "contact": 0,
        }
    )
    submission_df.to_csv(public / "sample_submission.csv", index=False)
@@ -0,0 +1,186 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+
9
def _process_and_save_split(
    train_game_play_ids: list,
    test_game_play_ids: list,
    raw_path: Path,
    public_path: Path,
    private_path: Path,
    old_train_labels_df: pd.DataFrame,
    old_train_baseline_helmets_df: pd.DataFrame,
    old_train_player_tracking_df: pd.DataFrame,
    old_train_video_metadata_df: pd.DataFrame,
):
    """
    A helper function to process and save a single data split (e.g., train/test or train_val/test_val).

    This function filters raw dataframes based on game play IDs, saves the resulting CSVs,
    copies the corresponding video files, and creates a sample submission file.
    The output filenames are fixed to ensure consistent structure across different splits.

    Args:
        train_game_play_ids: Game play ids assigned to the train side of the split.
        test_game_play_ids: Game play ids assigned to the test side of the split.
        raw_path: Raw data directory; videos are read from ``raw_path / "train"``.
        public_path: Destination for train data, test inputs and the sample submission.
        private_path: Destination for the labelled test set (``test.csv``).
        old_train_labels_df: Raw labels table with ``game_play`` and ``contact_id`` columns.
        old_train_baseline_helmets_df: Raw baseline helmets table with a ``game_play`` column.
        old_train_player_tracking_df: Raw player tracking table with a ``game_play`` column.
        old_train_video_metadata_df: Raw video metadata table with a ``game_play`` column.
    """
    public_path.mkdir(exist_ok=True, parents=True)
    private_path.mkdir(exist_ok=True, parents=True)
    (public_path / "train").mkdir(exist_ok=True)
    (public_path / "test").mkdir(exist_ok=True)

    # Filter and save train/test labels
    new_train = old_train_labels_df[
        old_train_labels_df["game_play"].isin(train_game_play_ids)
    ]
    new_test = old_train_labels_df[
        old_train_labels_df["game_play"].isin(test_game_play_ids)
    ]
    assert set(new_train["contact_id"]).isdisjoint(
        set(new_test["contact_id"])
    ), "Train and test label share samples!"

    new_train.to_csv(public_path / "train_labels.csv", index=False)
    new_test.to_csv(private_path / "test.csv", index=False)

    # The auxiliary tables are all keyed by game_play and are filtered identically.
    auxiliary_tables = {
        "baseline_helmets": old_train_baseline_helmets_df,
        "player_tracking": old_train_player_tracking_df,
        "video_metadata": old_train_video_metadata_df,
    }
    for table_name, table_df in auxiliary_tables.items():
        table_df[table_df["game_play"].isin(train_game_play_ids)].to_csv(
            public_path / f"train_{table_name}.csv", index=False
        )
        table_df[table_df["game_play"].isin(test_game_play_ids)].to_csv(
            public_path / f"test_{table_name}.csv", index=False
        )

    # Copy over videos. Both sides read from raw_path/train because every split is
    # carved out of the original training data.
    views = ("All29", "Endzone", "Sideline")
    plays_by_split = {
        "train": new_train["game_play"].unique(),
        "test": new_test["game_play"].unique(),
    }
    print(f"Copying videos to {public_path.name}...")
    for game_play_type in views:
        for split_name, game_plays in plays_by_split.items():
            for game_play in tqdm(
                game_plays,
                desc=f"Copying {split_name} videos ({game_play_type})",
            ):
                video_name = f"{game_play}_{game_play_type}.mp4"
                shutil.copyfile(
                    src=raw_path / "train" / video_name,
                    dst=public_path / split_name / video_name,
                )

    # Check integrity of the files copied: one video per view per game play
    num_train_videos_found = len(list(public_path.glob("train/*.mp4")))
    num_test_videos_found = len(list(public_path.glob("test/*.mp4")))
    num_expected_train_videos = len(plays_by_split["train"]) * len(views)
    num_expected_test_videos = len(plays_by_split["test"]) * len(views)

    assert (
        num_train_videos_found == num_expected_train_videos
    ), f"Expected {num_expected_train_videos} videos, found {num_train_videos_found}"
    assert (
        num_test_videos_found == num_expected_test_videos
    ), f"Expected {num_expected_test_videos} videos, found {num_test_videos_found}"

    # Create a sample submission file
    submission_df = pd.DataFrame(
        {
            "contact_id": new_test["contact_id"],
            "contact": 0,
        }
    )
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)
133
+
134
+
135
def prepare(raw: Path, public: Path, private: Path):
    """
    Produce the original train/test split plus a nested validation split.

    The original split goes to ``public``/``private``. The validation split re-splits
    the train game plays and goes to sibling ``public_val``/``private_val`` directories.
    """
    # Load all raw dataframes once; both splits filter the same tables.
    raw_tables = {
        "old_train_labels_df": pd.read_csv(raw / "train_labels.csv"),
        "old_train_baseline_helmets_df": pd.read_csv(raw / "train_baseline_helmets.csv"),
        "old_train_player_tracking_df": pd.read_csv(raw / "train_player_tracking.csv"),
        "old_train_video_metadata_df": pd.read_csv(raw / "train_video_metadata.csv"),
    }

    # --- Original Data Split (Train/Test) ---
    # Split on game play so train and test never share one.
    all_game_plays = raw_tables["old_train_labels_df"]["game_play"].unique()
    new_train_game_play, new_test_game_play = train_test_split(
        all_game_plays, test_size=0.1, random_state=0
    )

    print("--- Processing original train/test split ---")
    _process_and_save_split(
        train_game_play_ids=new_train_game_play,
        test_game_play_ids=new_test_game_play,
        raw_path=raw,
        public_path=public,
        private_path=private,
        **raw_tables,
    )
    print("--- Original split processing complete. ---\n")

    # --- New Validation Data Split (Train_val/Test_val) ---
    # Re-split only the train game plays, with the same fraction and seed as above.
    train_val_game_play, test_val_game_play = train_test_split(
        new_train_game_play, test_size=0.1, random_state=0
    )

    print("--- Processing validation train/test split ---")
    _process_and_save_split(
        train_game_play_ids=train_val_game_play,
        test_game_play_ids=test_val_game_play,
        raw_path=raw,
        public_path=public.parent / "public_val",
        private_path=private.parent / "private_val",
        **raw_tables,
    )
    print("--- Validation split processing complete. ---")
@@ -0,0 +1,47 @@
1
+ from typing import Tuple
2
+
3
+ import pandas as pd
4
+ from sklearn import metrics
5
+
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+
9
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
    """
    Validate a submission and extract the aligned target/prediction columns.

    When both frames carry an ``id`` column the rows are matched by id, so a
    submission whose rows are in a different order than the answers is still scored
    against the right targets. Without an ``id`` column on either side the rows are
    compared positionally.

    Args:
        submission: Frame with ``formation_energy_ev_natom`` and ``bandgap_energy_ev``.
        answers: Ground-truth frame with the same two target columns.

    Returns:
        ``(true_formation, true_bandgap, predicted_formation, predicted_bandgap)``.

    Raises:
        InvalidSubmissionError: If a target column is missing from the submission, the
            row counts differ, or the submission ids do not match the answer ids.
    """
    assert (
        "formation_energy_ev_natom" in answers.columns
    ), "Expected 'formation_energy_ev_natom' column in answers"
    assert "bandgap_energy_ev" in answers.columns, "Expected 'bandgap_energy_ev' column in answers"
    if "formation_energy_ev_natom" not in submission.columns:
        raise InvalidSubmissionError("Expected 'formation_energy_ev_natom' column in submission")
    if "bandgap_energy_ev" not in submission.columns:
        raise InvalidSubmissionError("Expected 'bandgap_energy_ev' column in submission")
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Expected {len(answers)} rows in submission, got {len(submission)}"
        )

    # Align by id when possible; positional comparison would silently mis-score a
    # correctly labelled but reordered submission.
    if "id" in submission.columns and "id" in answers.columns:
        if set(submission["id"]) != set(answers["id"]):
            raise InvalidSubmissionError("Submission ids do not match the ids in answers")
        submission = submission.sort_values("id", kind="stable")
        answers = answers.sort_values("id", kind="stable")

    true_labels_formation, true_labels_bandgap = (
        answers["formation_energy_ev_natom"],
        answers["bandgap_energy_ev"],
    )
    predictions_formation, predictions_bandgap = (
        submission["formation_energy_ev_natom"],
        submission["bandgap_energy_ev"],
    )

    return true_labels_formation, true_labels_bandgap, predictions_formation, predictions_bandgap
35
+
36
+
37
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Return the mean of the two per-target root mean squared log errors.

    The submission is validated by ``prepare_for_metric`` first, so any
    ``InvalidSubmissionError`` it raises propagates to the caller.
    """
    formation_true, bandgap_true, formation_pred, bandgap_pred = prepare_for_metric(
        submission, answers
    )
    formation_rmsle = metrics.root_mean_squared_log_error(formation_true, formation_pred)
    bandgap_rmsle = metrics.root_mean_squared_log_error(bandgap_true, bandgap_pred)
    return (formation_rmsle + bandgap_rmsle) / 2
@@ -0,0 +1,77 @@
1
+ import glob
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+
8
+ from mlebench.utils import extract, read_csv
9
+
10
+
11
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    The raw training table is split 90/10 (``random_state=0``). Ids are renumbered
    from 1 within each side, and each sample's geometry file is copied to
    ``public/<split>/<new id>/geometry.xyz``.

    Args:
        raw: Directory holding the raw competition archives.
        public: Destination for train data, unlabelled test data and the sample submission.
        private: Destination for the labelled test set (``test.csv``).
    """
    # Extract only what we need
    extract(raw / "train.zip", raw / "train")
    extract(raw / "train.csv.zip", raw / "train.csv")
    extract(raw / "test.zip", raw / "test")
    extract(raw / "test.csv.zip", raw / "test.csv")

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv/train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    # Take explicit copies so the id rewrite below never touches (or warns about)
    # data shared with old_train.
    new_train = new_train.copy()
    new_test = new_test.copy()

    # Make ids go 1, 2, ... for both train and test. Keep old ids so we can map ids of other files
    old_train_id_to_new = {
        old_id: new_id for new_id, old_id in enumerate(new_train["id"], start=1)
    }  # id starts from 1
    new_train["id"] = new_train["id"].map(old_train_id_to_new)

    old_test_id_to_new = {
        old_id: new_id for new_id, old_id in enumerate(new_test["id"], start=1)
    }  # id starts from 1
    new_test["id"] = new_test["id"].map(old_test_id_to_new)

    new_test_without_labels = new_test.drop(
        columns=["formation_energy_ev_natom", "bandgap_energy_ev"]
    )

    # Copy over files
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    # Geometry files live at train/train/<old id>/<file>.xyz; both new splits are
    # drawn from the raw training folder.
    train_paths = sorted(glob.glob(str(raw / "train/train/**/*.xyz")))
    splits = (
        ("train", old_train_id_to_new, new_train),
        ("test", old_test_id_to_new, new_test),
    )
    for split_name, old_id_to_new, split_df in splits:
        for src in train_paths:
            old_id = int(Path(src).parts[-2])
            new_id = old_id_to_new.get(old_id)
            if new_id is None:  # Sample belongs to the other side of the split
                continue

            dest_dir = public / split_name / str(new_id)
            dest_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy(src=src, dst=dest_dir / "geometry.xyz")

        num_found = len(list(public.glob(f"{split_name}/**/*.xyz")))
        assert num_found == len(
            split_df
        ), f"Expected {len(split_df)} {split_name} geometry files, found {num_found}"

    # Create mock submission
    sample_submission = pd.DataFrame(
        {"id": new_test["id"], "formation_energy_ev_natom": 0.1779, "bandgap_energy_ev": 1.8892}
    )
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    assert len(sample_submission) == len(
        new_test
    ), "Sample submission should have the same number of rows as the test set"
@@ -0,0 +1,144 @@
1
+ import glob
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+
8
+ from mlebench.utils import extract, read_csv
9
+
10
+
11
def _process_split(
    source_df: pd.DataFrame,
    test_size: float,
    random_state: int,
    source_geometry_paths: list,
    public_dir: Path,
    private_dir: Path,
) -> pd.DataFrame:
    """
    Split ``source_df``, renumber ids from 1 on each side, and write every output file.

    Args:
        source_df: The DataFrame to split; must have an ``id`` column and both target columns.
        test_size: Forwarded to ``train_test_split`` as the size of the test side.
        random_state: Seed forwarded to ``train_test_split``.
        source_geometry_paths: Paths to geometry files. The name of each file's parent
            directory is parsed as an int and looked up among the ids in ``source_df``,
            so the directory names must use the same id numbering as ``source_df``.
        public_dir: Destination for ``train.csv``, the unlabelled ``test.csv``, the
            geometry folders and ``sample_submission.csv``.
        private_dir: Destination for the labelled ``test.csv``.

    Returns:
        The train side of the split, with its ``id`` column already renumbered.
    """
    for directory in (public_dir, private_dir):
        directory.mkdir(parents=True, exist_ok=True)

    new_train, new_test = train_test_split(
        source_df, test_size=test_size, random_state=random_state
    )

    # Renumber ids 1, 2, ... per side, keeping the old -> new maps for the geometry copy.
    old_train_id_to_new = dict(zip(new_train["id"], range(1, len(new_train) + 1)))
    new_train["id"] = new_train["id"].map(old_train_id_to_new)

    old_test_id_to_new = dict(zip(new_test["id"], range(1, len(new_test) + 1)))
    new_test["id"] = new_test["id"].map(old_test_id_to_new)

    label_columns = ["formation_energy_ev_natom", "bandgap_energy_ev"]
    new_test_without_labels = new_test.drop(columns=label_columns)

    # Write the tables
    new_train.to_csv(public_dir / "train.csv", index=False)
    new_test.to_csv(private_dir / "test.csv", index=False)
    new_test_without_labels.to_csv(public_dir / "test.csv", index=False)

    def _copy_geometry(id_lookup: dict, target_root: Path) -> None:
        # Copy each geometry file whose source id is in id_lookup under its new id.
        for geometry_path in source_geometry_paths:
            source_id = int(Path(geometry_path).parts[-2])
            if source_id in id_lookup:
                target_dir = target_root / str(id_lookup[source_id])
                target_dir.mkdir(parents=True, exist_ok=True)
                shutil.copy(src=geometry_path, dst=target_dir / "geometry.xyz")

    # --- Geometry files for the new train set ---
    train_geometry_dir = public_dir / "train"
    _copy_geometry(old_train_id_to_new, train_geometry_dir)
    assert len(list(train_geometry_dir.glob("**/*.xyz"))) == len(
        new_train
    ), f"Expected {len(new_train)} train geometry files in {public_dir}, found {len(list(train_geometry_dir.glob('**/*.xyz')))}"

    # --- Geometry files for the new test set ---
    test_geometry_dir = public_dir / "test"
    _copy_geometry(old_test_id_to_new, test_geometry_dir)
    assert len(list(test_geometry_dir.glob("**/*.xyz"))) == len(
        new_test
    ), f"Expected {len(new_test)} test geometry files in {public_dir}, found {len(list(test_geometry_dir.glob('**/*.xyz')))}"

    # Mock submission filled with constant predictions
    sample_submission = pd.DataFrame(
        {"id": new_test["id"], "formation_energy_ev_natom": 0.1779, "bandgap_energy_ev": 1.8892}
    )
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
    assert len(sample_submission) == len(
        new_test
    ), f"Sample submission in {public_dir} should have the same number of rows as its test set"

    return new_train
102
+
103
+
104
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split (public_val/private_val) from the training data.

    Args:
        raw: Directory holding the raw competition archives.
        public: Destination for the original split's public files.
        private: Destination for the original split's labelled test set.
    """
    # Define paths for the new validation set
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Extract only what we need
    extract(raw / "train.zip", raw / "train")
    extract(raw / "train.csv.zip", raw / "train.csv")
    extract(raw / "test.zip", raw / "test")
    extract(raw / "test.csv.zip", raw / "test.csv")

    # Load initial data and geometry paths
    initial_train_df = read_csv(raw / "train.csv/train.csv")
    all_geometry_paths = sorted(glob.glob(str(raw / "train/train/**/*.xyz")))

    # --- First Split: Create original public/private datasets ---
    # This split is identical to the original script to ensure unchanged outputs.
    train_df_after_first_split = _process_split(
        source_df=initial_train_df,
        test_size=0.1,
        random_state=0,
        source_geometry_paths=all_geometry_paths,
        public_dir=public,
        private_dir=private,
    )

    # --- Second Split: Create new validation datasets from the first split's train set ---
    # The frame returned by the first split carries *renumbered* ids (1..N), while the
    # raw geometry folders are still named after the original ids. The geometry files
    # the first split wrote to public/train/<renumbered id>/geometry.xyz are keyed by
    # the same ids as the frame, so they are the correct source here. Reusing the raw
    # paths would pair rows with geometry files that belong to other samples.
    first_split_geometry_paths = sorted(
        str(path) for path in (public / "train").glob("*/geometry.xyz")
    )

    # The test_size is set to 1/9 of the incoming data (which is 90% of the original).
    # This makes the new test_val set size equal to the original test set size (0.1 * T = 1/9 * 0.9 * T).
    _process_split(
        source_df=train_df_after_first_split,
        test_size=1 / 9,
        random_state=0,
        source_geometry_paths=first_split_geometry_paths,
        public_dir=public_val,
        private_dir=private_val,
    )