dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,259 @@
1
+ import json
2
+ import random
3
+ import shutil
4
+ import tarfile
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ from sklearn.model_selection import train_test_split
9
+ from tqdm import tqdm
10
+
11
+ from mlebench.utils import get_logger
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def add_to_tar(src: Path, out: Path):
    """Archive every file under `src` into a tar file at `out`.

    Members are stored with paths relative to `src`, so extracting the
    archive reproduces the directory's contents without the `src` prefix.

    Args:
        src: Directory whose files are archived. Must exist and be a directory.
        out: Destination path of the (uncompressed) tar archive; overwritten
            if it already exists.

    Raises:
        FileNotFoundError: If `src` does not exist.
        NotADirectoryError: If `src` exists but is not a directory.
    """
    # Validate with real exceptions instead of `assert`: asserts are
    # stripped when Python runs with -O, which would silently skip
    # these checks.
    if not src.exists():
        raise FileNotFoundError(f"Source directory `{src}` does not exist.")
    if not src.is_dir():
        raise NotADirectoryError(f"Expected a directory, but got `{src}`.")

    tqdm_desc = f"Taring {src.name} to {out.name}"
    # Materialize the file list up front so tqdm can show a total.
    file_paths = [path for path in src.rglob("*") if path.is_file()]
    total_files = len(file_paths)

    # Mode "w" writes an uncompressed tar; the context manager ensures the
    # archive is finalized and closed even if adding a member fails.
    with tarfile.open(out, "w") as tar:
        for file_path in tqdm(file_paths, desc=tqdm_desc, unit="file", total=total_files):
            tar.add(file_path, arcname=file_path.relative_to(src))
28
+
29
+
30
+ def prepare(raw: Path, public: Path, private: Path):
31
+
32
+ dev_mode = False
33
+ image_count = 2 if dev_mode else float("inf") # We copy over 2 images per category for dev mode
34
+
35
+ # Extract train_val2019.tar.gz which contains images
36
+ train_tar_path = raw / "train_val2019.tar.gz"
37
+ train_extract_path = raw
38
+ if not (raw / "train_val2019").exists():
39
+ shutil.unpack_archive(train_tar_path, train_extract_path)
40
+
41
+ # Create train, test from train split
42
+ json_path = raw / "train2019.json"
43
+ with open(json_path, "r", encoding="utf-8") as f:
44
+ old_train_metadata = json.load(f)
45
+
46
+ # Organize data by category so that we can split per-category later
47
+ annotation_image_metadata_by_category = {} # We'll collect both `annotations` and `images` here
48
+ for annotation_info, image_info in list(
49
+ zip(old_train_metadata["annotations"], old_train_metadata["images"])
50
+ ):
51
+ assert (
52
+ annotation_info["image_id"] == image_info["id"]
53
+ ), f"Mismatching image_id in annotation and image: {annotation_info['image_id']} vs {image_info['id']}"
54
+ category_id = annotation_info["category_id"]
55
+ if category_id not in annotation_image_metadata_by_category:
56
+ annotation_image_metadata_by_category[category_id] = []
57
+ annotation_image_metadata_by_category[category_id].append(
58
+ {
59
+ "annotation": annotation_info,
60
+ "image": image_info,
61
+ }
62
+ )
63
+
64
+ # Split train/test
65
+ train_sample_count = 0 # Useful for tqdm later
66
+ train_annotation_image_metadata_by_category = {}
67
+ test_annotation_image_metadata_by_category = {}
68
+
69
+ for category_id, annotation_image_metadata in tqdm(
70
+ annotation_image_metadata_by_category.items(), desc="Assigning train/test splits"
71
+ ):
72
+ # Create split by "category" (class)
73
+ # Original train+val has 268,243 images, test has 35,400 images, 0.12 ratio
74
+ test_size = 0.12
75
+ n_samples = len(annotation_image_metadata)
76
+ if n_samples == 1:
77
+ # If only one sample, put it in train
78
+ train_annotations_images = annotation_image_metadata
79
+ test_annotations_images = []
80
+ elif n_samples < 5: # Minimum 5 samples to ensure at least 1 in test
81
+ num_test_samples = max(1, int(n_samples * test_size))
82
+ train_annotations_images = annotation_image_metadata[:-num_test_samples]
83
+ test_annotations_images = annotation_image_metadata[-num_test_samples:]
84
+ else:
85
+ train_annotations_images, test_annotations_images = train_test_split(
86
+ annotation_image_metadata, test_size=test_size, random_state=0
87
+ )
88
+
89
+ train_annotation_image_metadata_by_category[category_id] = train_annotations_images
90
+ test_annotation_image_metadata_by_category[category_id] = test_annotations_images
91
+ train_sample_count += len(train_annotations_images)
92
+
93
+ # Create new train2019.json
94
+ new_train_metadata = (
95
+ old_train_metadata.copy()
96
+ ) # Keep 'info', 'categories', 'licenses' unchanged
97
+ new_train_metadata.update(
98
+ {
99
+ "annotations": [],
100
+ "images": [],
101
+ }
102
+ )
103
+ for category_id, annotation_image_metadata in tqdm(
104
+ train_annotation_image_metadata_by_category.items(),
105
+ desc="Creating new train2019.json",
106
+ total=len(train_annotation_image_metadata_by_category),
107
+ ):
108
+ for annotation_image in annotation_image_metadata:
109
+ new_annotation = annotation_image["annotation"].copy()
110
+ new_train_metadata["annotations"].append(new_annotation)
111
+ new_image = annotation_image["image"].copy()
112
+ new_train_metadata["images"].append(new_image)
113
+
114
+ with open(public / "train2019.json", "w") as f:
115
+ json.dump(new_train_metadata, f, indent=4, sort_keys=True)
116
+
117
+ # Copy over val2019.json
118
+ shutil.copy(raw / "val2019.json", public / "val2019.json")
119
+ logger.info(f"Copied {raw / 'val2019.json'} to {public / 'val2019.json'}")
120
+
121
+ # Create new test2019.json
122
+ new_to_old_file_name = {}
123
+ new_test_metadata = old_train_metadata.copy()
124
+ del new_test_metadata["categories"]
125
+ new_test_metadata.update(
126
+ {
127
+ "annotations": [],
128
+ "images": [],
129
+ }
130
+ )
131
+ # Flatten and shuffle test set so that we don't have all the same catedgories in a row
132
+ test_annotations_images = [
133
+ item for sublist in test_annotation_image_metadata_by_category.values() for item in sublist
134
+ ]
135
+ random.Random(0).shuffle(test_annotations_images)
136
+ for idx, annotation_image in tqdm(
137
+ enumerate(test_annotations_images),
138
+ desc="Creating new test2019.json",
139
+ total=len(test_annotations_images),
140
+ ):
141
+
142
+ new_annotation = annotation_image["annotation"].copy()
143
+ new_test_metadata["annotations"].append(new_annotation)
144
+
145
+ new_image = annotation_image["image"].copy()
146
+ old_file_name = new_image["file_name"]
147
+ # go from e.g. "train_val2019/Plants/400/d1322d13ccd856eb4236c8b888546c79.jpg" to "test2019/d1322d13ccd856eb4236c8b888546c79.jpg"
148
+ new_file_name = "test2019/" + old_file_name.split("/")[-1]
149
+ # keep track of things so we know what to copy later
150
+ new_to_old_file_name[new_file_name] = old_file_name
151
+ new_image["file_name"] = new_file_name
152
+ new_test_metadata["images"].append(new_image)
153
+ with open(public / "test2019.json", "w") as f:
154
+ # The public test data, of course, doesn't have annotations
155
+ public_new_test = new_test_metadata.copy()
156
+ del public_new_test["annotations"]
157
+ assert public_new_test.keys() == {
158
+ "images",
159
+ "info",
160
+ "licenses",
161
+ }, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
162
+ json.dump(public_new_test, f, indent=4, sort_keys=True)
163
+
164
+ (public / "train_val2019").mkdir(parents=True, exist_ok=True)
165
+ (public / "test2019").mkdir(parents=True, exist_ok=True)
166
+
167
+ # Save private test answers
168
+ answers_rows = []
169
+ for image_info, annotation_info in zip(
170
+ new_test_metadata["images"], new_test_metadata["annotations"]
171
+ ):
172
+ assert (
173
+ image_info["id"] == annotation_info["image_id"]
174
+ ), f"Mismatching image_id in image and annotation: {image_info['id']} vs {annotation_info['image_id']}"
175
+ answers_rows.append(
176
+ {
177
+ "id": image_info["id"],
178
+ "predicted": annotation_info["category_id"],
179
+ }
180
+ )
181
+ answers_df = pd.DataFrame(answers_rows)
182
+ answers_df.to_csv(private / "answers.csv", index=False)
183
+
184
+ # Create new sample submission based on answers_df
185
+ sample_df = answers_df.copy()
186
+ sample_df["predicted"] = [random.Random(42).randint(0, 1009) for _ in range(len(sample_df))]
187
+ sample_df.to_csv(public / "kaggle_sample_submission.csv", index=False)
188
+
189
+ assert len(answers_df) == len(
190
+ new_test_metadata["images"]
191
+ ), f"Expected {len(new_test_metadata['images'])} rows in answers, but found {len(answers_df)}"
192
+ assert len(sample_df) == len(
193
+ answers_df
194
+ ), f"Expected {len(answers_df)} rows in sample submission, but found {len(sample_df)}"
195
+ assert answers_df["id"].equals(
196
+ sample_df["id"]
197
+ ), "Mismatched 'id' columns between answers and sample submission"
198
+
199
+ # Copy train images
200
+ train_images_copied = 0
201
+ for category_id, annotation_image_metadata in tqdm(
202
+ train_annotation_image_metadata_by_category.items(),
203
+ desc="Copying train images grouped by category",
204
+ ):
205
+ for idx, annotation_image in enumerate(annotation_image_metadata):
206
+ if dev_mode and idx >= image_count:
207
+ break
208
+ old_path = raw / annotation_image["image"]["file_name"]
209
+ new_path = public / annotation_image["image"]["file_name"]
210
+ new_path.parent.mkdir(parents=True, exist_ok=True)
211
+ shutil.copy(old_path, new_path)
212
+ train_images_copied += 1
213
+
214
+ # Copy test images
215
+ test_images_copied = 0
216
+ for image_info in tqdm(new_test_metadata["images"], desc="Copying test images"):
217
+ if dev_mode and test_images_copied >= image_count:
218
+ break
219
+ old_path = raw / new_to_old_file_name[image_info["file_name"]]
220
+ new_path = public / image_info["file_name"]
221
+ new_path.parent.mkdir(parents=True, exist_ok=True)
222
+ shutil.copy(old_path, new_path)
223
+ test_images_copied += 1
224
+
225
+ logger.info(f"Copied {train_images_copied} train images and {test_images_copied} test images")
226
+
227
+ if not dev_mode:
228
+ assert len(list((public / "train_val2019").glob("**/*.jpg"))) == len(
229
+ new_train_metadata["images"]
230
+ ), f"Mismatching number of images in train_images, got {len(list((public / 'train_val2019').glob('**/*.jpg')))}"
231
+
232
+ assert len(new_train_metadata["annotations"]) == len(
233
+ new_train_metadata["images"]
234
+ ), f"Mismatching number of annotations in train_metadata, got {len(new_train_metadata['annotations'])}"
235
+
236
+ if not dev_mode:
237
+ assert len(list((public / "test2019").glob("**/*.jpg"))) == len(
238
+ new_test_metadata["images"]
239
+ ), f"Mismatching number of images in test_images, got {len(list((public / 'test2019').glob('**/*.jpg')))}"
240
+
241
+ assert len(new_test_metadata["annotations"]) == len(
242
+ new_test_metadata["images"]
243
+ ), f"Mismatching number of annotations in test_metadata, got {len(new_test_metadata['annotations'])}"
244
+
245
+ assert len(new_train_metadata["annotations"]) + len(
246
+ new_test_metadata["annotations"]
247
+ ) == len(old_train_metadata["annotations"]), (
248
+ f"Expected {len(old_train_metadata['annotations'])} annotations in total, but found"
249
+ f"{len(new_train_metadata['annotations'])} in train and {len(new_test_metadata['annotations'])} in test"
250
+ )
251
+
252
+ # Re-compress images
253
+ add_to_tar(
254
+ src=public / "test2019", out=public / "test2019.tar.gz"
255
+ ) # Add to tar but don't actually compress with gzip to save time
256
+ add_to_tar(src=public / "train_val2019", out=public / "train_val2019.tar.gz")
257
+ # Remove uncompressed directories
258
+ shutil.rmtree(public / "test2019")
259
+ shutil.rmtree(public / "train_val2019")
@@ -0,0 +1,304 @@
1
+ import json
2
+ import random
3
+ import shutil
4
+ import tarfile
5
+ from pathlib import Path
6
+ from typing import Dict, List
7
+
8
+ import pandas as pd
9
+ from sklearn.model_selection import train_test_split
10
+ from tqdm import tqdm
11
+
12
+ from mlebench.utils import get_logger
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
def add_to_tar(src: Path, out: Path):
    """Archive every file under `src` into a tar file at `out`.

    Members are stored with paths relative to `src`, so the top-level
    directory name itself is not part of the archive. The tar is opened
    with mode "w" (no gzip compression), which keeps this step fast even
    though the output name may end in `.tar.gz`.
    """
    assert src.exists(), f"Source directory `{src}` does not exist."
    assert src.is_dir(), f"Expected a directory, but got `{src}`."

    members = [p for p in src.rglob("*") if p.is_file()]
    progress = tqdm(
        members,
        desc=f"Taring {src.name} to {out.name}",
        unit="file",
        total=len(members),
    )
    with tarfile.open(out, "w") as archive:
        for member in progress:
            archive.add(member, arcname=member.relative_to(src))
29
+
30
+
31
def _split_by_category(
    data_by_category: Dict, test_size: float, random_state: int
) -> tuple[Dict, Dict]:
    """Split each category's items into train/test, mirroring the original script.

    A category with a single item goes entirely to train; a very small
    category (fewer than 5 items) takes a tail slice of at least one item
    for test; anything larger is delegated to sklearn's `train_test_split`
    with the given `random_state` for reproducibility.
    """
    train_by_category: Dict = {}
    test_by_category: Dict = {}

    progress = tqdm(
        data_by_category.items(),
        desc=f"Assigning train/test splits (test_size={test_size:.3f})",
    )
    for category_id, items in progress:
        count = len(items)
        if count == 1:
            # A lone sample cannot be split; keep it in train.
            train_items, test_items = items, []
        elif count < 5:
            # Too few for a proportional split; peel at least one item off the tail.
            n_test = max(1, int(count * test_size))
            train_items, test_items = items[:-n_test], items[-n_test:]
        else:
            train_items, test_items = train_test_split(
                items, test_size=test_size, random_state=random_state
            )

        train_by_category[category_id] = train_items
        test_by_category[category_id] = test_items

    return train_by_category, test_by_category
60
+
61
+
62
def _generate_split_files(
    train_annotation_image_metadata_by_category: Dict,
    test_annotation_image_metadata_by_category: Dict,
    old_train_metadata: Dict,
    raw_path: Path,
    public_path: Path,
    private_path: Path,
    dev_mode: bool,
    image_count: float,
):
    """
    Processes a given train/test split and saves all corresponding files
    (metadata, images, private answers, etc.) to the specified output directories.

    Args:
        train_annotation_image_metadata_by_category: category_id -> list of
            {"annotation": ..., "image": ...} dicts for the train portion.
        test_annotation_image_metadata_by_category: same structure, test portion.
        old_train_metadata: original COCO-style train2019.json contents;
            its 'info' / 'categories' / 'licenses' entries are reused verbatim.
        raw_path: directory holding the raw competition files/images.
        public_path: output directory for agent-visible files.
        private_path: output directory for grading files (answers.csv).
        dev_mode: when True, copy at most `image_count` images per category.
        image_count: per-category image cap in dev mode (float('inf') otherwise).
    """
    public_path.mkdir(parents=True, exist_ok=True)
    private_path.mkdir(parents=True, exist_ok=True)

    # Create new train2019.json. Keep 'info', 'categories', 'licenses'
    # unchanged; only 'annotations' and 'images' are rebuilt from the split.
    new_train_metadata = old_train_metadata.copy()
    new_train_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    for category_id, annotation_image_metadata in tqdm(
        train_annotation_image_metadata_by_category.items(),
        desc=f"[{public_path.name}] Creating new train2019.json",
        total=len(train_annotation_image_metadata_by_category),
    ):
        for annotation_image in annotation_image_metadata:
            new_train_metadata["annotations"].append(annotation_image["annotation"].copy())
            new_train_metadata["images"].append(annotation_image["image"].copy())

    with open(public_path / "train2019.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    # Copy over val2019.json unchanged.
    shutil.copy(raw_path / "val2019.json", public_path / "val2019.json")

    # Create new test2019.json
    new_to_old_file_name = {}
    new_test_metadata = old_train_metadata.copy()
    del new_test_metadata["categories"]  # public test metadata must not reveal labels
    new_test_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    # Flatten and shuffle test set so that we don't have all the same categories in a row
    test_annotations_images = [
        item
        for sublist in test_annotation_image_metadata_by_category.values()
        for item in sublist
    ]
    random.Random(0).shuffle(test_annotations_images)
    for annotation_image in tqdm(
        test_annotations_images,
        desc=f"[{public_path.name}] Creating new test2019.json",
        total=len(test_annotations_images),
    ):
        new_test_metadata["annotations"].append(annotation_image["annotation"].copy())

        new_image = annotation_image["image"].copy()
        old_file_name = new_image["file_name"]
        # go from e.g. "train_val2019/Plants/400/d1322d13ccd856eb4236c8b888546c79.jpg" to "test2019/d1322d13ccd856eb4236c8b888546c79.jpg"
        new_file_name = "test2019/" + old_file_name.split("/")[-1]
        # keep track of things so we know what to copy later
        new_to_old_file_name[new_file_name] = old_file_name
        new_image["file_name"] = new_file_name
        new_test_metadata["images"].append(new_image)

    with open(public_path / "test2019.json", "w") as f:
        # The public test data, of course, doesn't have annotations
        public_new_test = new_test_metadata.copy()
        del public_new_test["annotations"]
        assert public_new_test.keys() == {
            "images",
            "info",
            "licenses",
        }, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
        json.dump(public_new_test, f, indent=4, sort_keys=True)

    (public_path / "train_val2019").mkdir(parents=True, exist_ok=True)
    (public_path / "test2019").mkdir(parents=True, exist_ok=True)

    # Save private test answers (id -> true category), aligned positionally
    # with the shuffled test metadata built above.
    answers_rows = []
    for image_info, annotation_info in zip(
        new_test_metadata["images"], new_test_metadata["annotations"]
    ):
        assert (
            image_info["id"] == annotation_info["image_id"]
        ), f"Mismatching image_id in image and annotation: {image_info['id']} vs {annotation_info['image_id']}"
        answers_rows.append(
            {
                "id": image_info["id"],
                "predicted": annotation_info["category_id"],
            }
        )
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private_path / "answers.csv", index=False)

    # Create new sample submission based on answers_df.
    # BUG FIX: the previous comprehension constructed random.Random(42) per
    # element, so every "random" prediction was the identical first draw.
    # Seed once and draw repeatedly so the placeholder labels actually vary.
    sample_df = answers_df.copy()
    rng = random.Random(42)
    sample_df["predicted"] = [rng.randint(0, 1009) for _ in range(len(sample_df))]
    sample_df.to_csv(public_path / "kaggle_sample_submission.csv", index=False)

    # Copy train images
    for annotation_image_metadata in tqdm(
        train_annotation_image_metadata_by_category.values(),
        desc=f"[{public_path.name}] Copying train images",
    ):
        for idx, annotation_image in enumerate(annotation_image_metadata):
            if dev_mode and idx >= image_count:
                break
            old_path = raw_path / annotation_image["image"]["file_name"]
            new_path = public_path / annotation_image["image"]["file_name"]
            new_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(old_path, new_path)

    # Copy test images.
    # BUG FIX: the previous dev-mode condition compared image_count against
    # len(new_to_old_file_name), which is already the *full* test-set size by
    # this point, so dev mode copied zero test images. Count copies instead
    # (matching the pre-refactor behavior).
    test_images_copied = 0
    for image_info in tqdm(
        new_test_metadata["images"], desc=f"[{public_path.name}] Copying test images"
    ):
        if dev_mode and test_images_copied >= image_count:
            break
        old_path = raw_path / new_to_old_file_name[image_info["file_name"]]
        new_path = public_path / image_info["file_name"]
        new_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(old_path, new_path)
        test_images_copied += 1

    # Re-compress images
    add_to_tar(
        src=public_path / "test2019", out=public_path / "test2019.tar.gz"
    )  # Add to tar but don't actually compress with gzip to save time
    add_to_tar(src=public_path / "train_val2019", out=public_path / "train_val2019.tar.gz")
    # Remove uncompressed directories
    shutil.rmtree(public_path / "test2019")
    shutil.rmtree(public_path / "train_val2019")
    logger.info(f"Finished generating files for {public_path.name}")
209
+
210
+
211
def prepare(raw: Path, public: Path, private: Path):
    """
    Build two complete dataset variants from the raw iNaturalist 2019 dump:

    1. The original train/test split, written to `public` / `private`.
    2. A secondary train/validation split carved out of the first split's
       train portion (sized to match the original test set), written to
       sibling `public_val` / `private_val` directories.

    Args:
        raw: Directory containing the raw competition download.
        public: Output directory for agent-visible files of the main split.
        private: Output directory for grading files of the main split.
    """
    # Flip to True for a fast smoke-test run that copies only a few images.
    dev_mode = False
    image_count = 2 if dev_mode else float("inf")  # We copy over 2 images per category for dev mode

    # Extract train_val2019.tar.gz which contains images
    train_tar_path = raw / "train_val2019.tar.gz"
    train_extract_path = raw
    if not (raw / "train_val2019").exists():
        logger.info("Extracting raw image data...")
        shutil.unpack_archive(train_tar_path, train_extract_path)

    # Create train, test from train split
    json_path = raw / "train2019.json"
    with open(json_path, "r", encoding="utf-8") as f:
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later
    annotation_image_metadata_by_category = {}  # We'll collect both `annotations` and `images` here
    # Relies on 'annotations' and 'images' being parallel lists; the assert
    # below verifies that pairing for every element.
    for annotation_info, image_info in list(
        zip(old_train_metadata["annotations"], old_train_metadata["images"])
    ):
        assert (
            annotation_info["image_id"] == image_info["id"]
        ), f"Mismatching image_id in annotation and image: {annotation_info['image_id']} vs {image_info['id']}"
        category_id = annotation_info["category_id"]
        if category_id not in annotation_image_metadata_by_category:
            annotation_image_metadata_by_category[category_id] = []
        annotation_image_metadata_by_category[category_id].append(
            {
                "annotation": annotation_info,
                "image": image_info,
            }
        )

    # --- 1. Original Data Split (Train/Test) ---
    logger.info("--- Generating Original Train/Test Split ---")
    # Original train+val has 268,243 images, test has 35,400 images, ~0.12 ratio
    original_test_size = 0.12
    (
        original_train_split,
        original_test_split,
    ) = _split_by_category(
        annotation_image_metadata_by_category,
        test_size=original_test_size,
        random_state=0,
    )

    _generate_split_files(
        original_train_split,
        original_test_split,
        old_train_metadata,
        raw,
        public,
        private,
        dev_mode,
        image_count,
    )
    logger.info(f"Original split saved to {public.name} and {private.name}")

    # --- 2. New Validation Data Split (Train/Val) ---
    logger.info("--- Generating New Train/Validation Split ---")
    # Define new output directories
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # We want the new validation set ('test_val') to have the same size as the
    # original test set. We are splitting the *original_train_split* to get it.
    # test_val_size = new_test_size * train_size
    # We want: test_val_size ≈ test_size
    # So: new_test_size * (N * (1-0.12)) ≈ N * 0.12
    # new_test_size ≈ 0.12 / (1 - 0.12)
    val_split_test_size = original_test_size / (1.0 - original_test_size)

    (
        validation_train_split,
        validation_test_split,
    ) = _split_by_category(
        original_train_split,  # Split the TRAIN set from the first split
        test_size=val_split_test_size,
        random_state=0,  # Use same random state for consistency
    )

    _generate_split_files(
        validation_train_split,
        validation_test_split,
        old_train_metadata,
        raw,
        public_val,
        private_val,
        dev_mode,
        image_count,
    )
    logger.info(f"Validation split saved to {public_val.name} and {private_val.name}")
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import roc_auc_score
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Validate a submission against the answers and return aligned arrays.

    Both frames are sorted by `id` so rows correspond positionally; returns
    a `(y_true, y_pred)` pair of numpy arrays. Raises
    `InvalidSubmissionError` for any structural or value problem.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    required_columns = {
        "id": "Submission must have an 'id' column",
        "target": "Submission must have a 'target' column",
    }
    for column, message in required_columns.items():
        if column not in submission.columns:
            raise InvalidSubmissionError(message)

    # Align both frames on id so rows correspond positionally.
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    if not (submission["id"] == answers["id"]).all():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    y_true = answers["target"].values
    y_pred = submission["target"].values

    # Coerce predictions to floats; anything non-numeric is invalid.
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.isnan(y_pred).any():
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    out_of_range = (y_pred < 0) | (y_pred > 1)
    if np.any(out_of_range):
        raise InvalidSubmissionError("Predictions must be probabilities between 0 and 1")

    return y_true, y_pred
46
+
47
+
48
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Grade a submission with the ROC AUC metric.

    Validation and alignment are delegated to `prepare_for_metric`;
    this simply scores the resulting arrays.
    """
    true_labels, predicted_probs = prepare_for_metric(submission, answers)
    return roc_auc_score(true_labels, predicted_probs)
@@ -0,0 +1,25 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare the instant-gratification dataset.

    Copies the train/test/sample-submission CSVs into `public` (visible to
    agents) and the held-out test answers into `private` (used for grading).

    Args:
        raw: Directory containing the raw competition files.
        public: Output directory for agent-visible files.
        private: Output directory for grading files.
    """
    # FIX: ensure output directories exist before writing (other prepare
    # scripts in this package create them; this one assumed they existed).
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    # Read data
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Validate BEFORE writing so a bad raw dump fails early with no partial output.
    assert len(test_answer) == len(sample_submission), \
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"

    # Public files (visible to agents)
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)