dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import roc_auc_score
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Validate a submission against the answers and extract AUC-ROC inputs.

    Both frames are sorted by ``id`` before comparison, so the rows may be
    supplied in any order.

    Args:
        submission: Predictions; must contain ``id`` and ``target`` columns.
        answers: Ground truth; read through its ``id`` and ``target`` columns.

    Returns:
        A ``(y_true, y_pred)`` tuple of arrays ordered by ``id``. ``y_pred``
        is cast to float.

    Raises:
        InvalidSubmissionError: If the row counts differ, the submission lacks
            an ``id`` or ``target`` column, the sorted ids do not match the
            answers, or the predictions are non-numeric, contain NaN, or fall
            outside ``[0, 1]``.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column")

    if "target" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'target' column")

    # Sort by id so that row i of both frames refers to the same sample
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    # Check id alignment
    if not (submission["id"] == answers["id"]).all():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    # Extract predictions and true values
    y_pred = submission["target"].values
    y_true = answers["target"].values

    # Validate predictions
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError) as err:
        # Chain the cause so the offending value stays visible in the traceback
        raise InvalidSubmissionError("Predictions must be numeric") from err

    if np.any(np.isnan(y_pred)):
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    if np.any((y_pred < 0) | (y_pred > 1)):
        raise InvalidSubmissionError("Predictions must be probabilities between 0 and 1")

    return y_true, y_pred
46
+
47
+
48
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with AUC-ROC.

    The frames are first passed through ``prepare_for_metric``; its two
    outputs are then handed to ``roc_auc_score`` as the true labels and the
    predicted scores, in that order.
    """
    labels, scores = prepare_for_metric(submission, answers)
    return roc_auc_score(labels, scores)
@@ -0,0 +1,25 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare instant-gratification dataset.

    Reads ``train.csv``, ``test.csv``, ``sample_submission.csv`` and
    ``test_answer.csv`` from ``raw``. The first three are written unchanged
    (without the index) into ``public``; the answers are written to
    ``private / "test.csv"``.

    Args:
        raw: Directory holding the four input CSV files.
        public: Existing directory that receives the public files.
        private: Existing directory that receives the grading file.

    Raises:
        AssertionError: If the answers and the sample submission differ in
            length. The check runs before anything is written, so a failure
            leaves ``public`` and ``private`` untouched.
    """
    # Read data
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Validation checks: done up front so inconsistent raw data never
    # produces a partially written dataset
    assert len(test_answer) == len(sample_submission), (
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
    )

    # Public files (visible to agents)
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,11 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import roc_auc_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_auroc_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with ROC AUC.

    ``prepare_for_auroc_metric`` is called with ``name`` as the id column and
    ``invasive`` as the target column; the ``y_true`` and ``y_score`` entries
    of its result are forwarded to ``roc_auc_score``.
    """
    metric_inputs = prepare_for_auroc_metric(
        submission=submission,
        answers=answers,
        id_col="name",
        target_col="invasive",
    )
    y_true = metric_inputs["y_true"]
    y_score = metric_inputs["y_score"]
    return roc_auc_score(y_true=y_true, y_score=y_score)
@@ -0,0 +1,97 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import py7zr
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+ from mlebench.utils import extract, read_csv
9
+
10
+
11
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    The labelled training set found in `raw` is split 80/20 (fixed seed) into a
    new train set and a held-out test set. Outputs:

    - `private/answers.csv` and `private/sample_submission.csv`
    - `public/train_labels.csv.zip`, `public/sample_submission.csv.zip`
    - `public/train.7z` and `public/test.7z` containing the images under
      `train/` and `test/` respectively

    The unpacked image directories and CSVs in `public` are temporary and are
    removed once the archives have been written.
    """
    # extract only what we need
    extract(raw / "train.7z", raw)
    extract(raw / "train_labels.csv.zip", raw)

    # Create train, test from train split
    # Original ratio is 1531/(1531+2295) = 0.4
    test_ratio = 0.2
    old_train = read_csv(raw / "train_labels.csv")
    new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)

    # Sample submission: same rows as the answers, with a constant prediction
    sample_submission = answers.copy()
    sample_submission["invasive"] = 0.5

    # Checks
    assert new_train["name"].is_unique, "new_train should have unique names"
    assert answers["name"].is_unique, "answers should have unique names"
    assert set(new_train["name"]).isdisjoint(
        set(answers["name"])
    ), "new_train and answers should be disjoint"
    assert len(new_train) + len(answers) == len(
        old_train
    ), "new_train and answers together should have the same number of rows as old_train"
    assert (
        new_train.columns.tolist() == old_train.columns.tolist()
    ), "new_train should have the same columns as old_train"
    assert (
        answers.columns.tolist() == old_train.columns.tolist()
    ), "answers should have the same columns as old_train"
    assert (
        sample_submission.columns.tolist() == old_train.columns.tolist()
    ), "sample_submission should have the same columns as old_train"

    # Write CSVs
    answers.to_csv(private / "answers.csv", index=False)
    new_train.to_csv(public / "train_labels.csv", index=False)
    sample_submission.to_csv(private / "sample_submission.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Copy files
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    for file_id in tqdm(new_train["name"], desc="Copying Train Images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    # Test images also come from raw/train: the test set is carved out of it
    for file_id in tqdm(answers["name"], desc="Copying Test Images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Checks
    assert len(list((public / "train").glob("*.jpg"))) == len(
        new_train
    ), "public/train should have the same number of files as new_train"
    assert len(list((public / "test").glob("*.jpg"))) == len(
        answers
    ), "public/test should have the same number of files as answers"

    # Zip
    shutil.make_archive(
        str(public / "sample_submission.csv"),
        "zip",
        root_dir=public,
        base_dir="sample_submission.csv",
    )
    shutil.make_archive(
        str(public / "train_labels.csv"), "zip", root_dir=public, base_dir="train_labels.csv"
    )
    # `write()` records a single entry only; `writeall()` recurses into the
    # directory so the images actually end up in the archive. `arcname` keeps
    # the stored paths relative ("train/...", "test/...") instead of embedding
    # the location of `public` on this machine.
    with py7zr.SevenZipFile(public / "train.7z", "w") as z:
        z.writeall(public / "train", arcname="train")
    with py7zr.SevenZipFile(public / "test.7z", "w") as z:
        z.writeall(public / "test", arcname="test")

    # Delete the unpacked copies now that the archives exist
    shutil.rmtree(public / "train")
    shutil.rmtree(public / "test")
    (public / "sample_submission.csv").unlink()
    (public / "train_labels.csv").unlink()
@@ -0,0 +1,164 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ import py7zr
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+ from mlebench.utils import extract, read_csv
10
+
11
+
12
def _process_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    raw_images_dir: Path,
    public_dir: Path,
    private_dir: Path,
):
    """
    Helper function to process a single data split (train/test).

    This function handles:
    - Creating a sample submission.
    - Writing all necessary CSV files to public and private directories.
    - Copying image files to temporary train/test subdirectories.
    - Zipping the final artifacts.
    - Cleaning up temporary files.

    Args:
        train_df: Rows of the training portion; must have a `name` column
            holding the image file stem.
        test_df: Rows of the held-out portion (same columns); written out
            unchanged as the private answers.
        raw_images_dir: Directory containing `<name>.jpg` for every row of
            both frames.
        public_dir: Output directory for agent-visible artifacts (created if
            missing).
        private_dir: Output directory for grading artifacts (created if
            missing).
    """
    # Create output directories if they don't exist
    public_dir.mkdir(exist_ok=True, parents=True)
    private_dir.mkdir(exist_ok=True, parents=True)

    # Sample submission: same rows as the test set, with a constant prediction
    sample_submission = test_df.copy()
    sample_submission["invasive"] = 0.5

    # Write CSVs
    test_df.to_csv(private_dir / "answers.csv", index=False)
    train_df.to_csv(public_dir / "train_labels.csv", index=False)
    sample_submission.to_csv(private_dir / "sample_submission.csv", index=False)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Create temporary directories for image copying
    public_train_images_dir = public_dir / "train"
    public_test_images_dir = public_dir / "test"
    public_train_images_dir.mkdir(exist_ok=True)
    public_test_images_dir.mkdir(exist_ok=True)

    # Copy files
    for file_id in tqdm(train_df["name"], desc=f"Copying Train Images to {public_dir.name}"):
        shutil.copyfile(
            src=raw_images_dir / f"{file_id}.jpg",
            dst=public_train_images_dir / f"{file_id}.jpg",
        )

    for file_id in tqdm(test_df["name"], desc=f"Copying Test Images to {public_dir.name}"):
        shutil.copyfile(
            src=raw_images_dir / f"{file_id}.jpg",
            dst=public_test_images_dir / f"{file_id}.jpg",
        )

    # Checks
    assert len(list(public_train_images_dir.glob("*.jpg"))) == len(
        train_df
    ), f"{public_dir.name}/train should have the same number of files as its corresponding train df"
    assert len(list(public_test_images_dir.glob("*.jpg"))) == len(
        test_df
    ), f"{public_dir.name}/test should have the same number of files as its corresponding test df"

    # Zip
    shutil.make_archive(
        str(public_dir / "sample_submission.csv"),
        "zip",
        root_dir=public_dir,
        base_dir="sample_submission.csv",
    )
    shutil.make_archive(
        str(public_dir / "train_labels.csv"),
        "zip",
        root_dir=public_dir,
        base_dir="train_labels.csv",
    )
    # `write()` records a single entry only; `writeall()` recurses into the
    # directory so the copied images are actually stored in the archive.
    with py7zr.SevenZipFile(public_dir / "train.7z", "w") as z:
        z.writeall(public_train_images_dir, arcname="train")
    with py7zr.SevenZipFile(public_dir / "test.7z", "w") as z:
        z.writeall(public_test_images_dir, arcname="test")

    # Delete temporary files and directories
    shutil.rmtree(public_train_images_dir)
    shutil.rmtree(public_test_images_dir)
    (public_dir / "sample_submission.csv").unlink()
    (public_dir / "train_labels.csv").unlink()
93
+
94
+
95
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Additionally, creates a second, parallel validation split (public_val/private_val).

    The first split (80/20, fixed seed) is written to `public` / `private`.
    The second split re-divides the first split's training portion and is
    written to sibling directories `public_val` / `private_val`; its held-out
    part has exactly as many rows as the first split's test set.
    """
    # Define paths for the new validation set
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # extract only what we need
    extract(raw / "train.7z", raw)
    extract(raw / "train_labels.csv.zip", raw)

    # ---- FIRST SPLIT (Original Train/Test) ----
    # Create train, test from train split
    # Original ratio is 1531/(1531+2295) = 0.4
    test_ratio = 0.2
    old_train = read_csv(raw / "train_labels.csv")
    new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)

    # Checks
    assert new_train["name"].is_unique, "new_train should have unique names"
    assert answers["name"].is_unique, "answers should have unique names"
    assert set(new_train["name"]).isdisjoint(
        set(answers["name"])
    ), "new_train and answers should be disjoint"
    assert len(new_train) + len(answers) == len(
        old_train
    ), "new_train and answers together should have the same number of rows as old_train"
    assert (
        new_train.columns.tolist() == old_train.columns.tolist()
    ), "new_train should have the same columns as old_train"
    assert (
        answers.columns.tolist() == old_train.columns.tolist()
    ), "answers should have the same columns as old_train"

    # Process and save the original public/private split
    # This ensures the original outputs are not modified
    _process_split(
        train_df=new_train,
        test_df=answers,
        raw_images_dir=raw / "train",
        public_dir=public,
        private_dir=private,
    )

    # ---- SECOND SPLIT (New Train/Validation) ----
    # Split the `new_train` set again to create a validation set.
    # The new test set (`test_val`) will have the same size as the original test set (`answers`).
    # An absolute row count is passed rather than the ratio
    # len(answers) / len(new_train): a float test_size is rounded up after a
    # floating-point multiplication, which can yield one row too many.
    train_val, test_val = train_test_split(
        new_train, test_size=len(answers), random_state=0
    )

    # Checks for the validation split
    assert set(train_val["name"]).isdisjoint(
        set(test_val["name"])
    ), "train_val and test_val should be disjoint"
    assert len(train_val) + len(test_val) == len(
        new_train
    ), "train_val and test_val together should have the same number of rows as new_train"

    # Process and save the new validation split into parallel directories
    _process_split(
        train_df=train_val,
        test_df=test_val,
        raw_images_dir=raw / "train",
        public_dir=public_val,
        private_dir=private_val,
    )
@@ -0,0 +1,44 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import f1_score
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def prepare_for_metric(
    submission: pd.DataFrame,
    answers: pd.DataFrame,
    id_col: str = "Id",
    target_col: str = "Category",
):
    """
    `submission` and `answers` are pd.DataFrame with `id_col` and `target_col` columns

    Both frames are sorted by `id_col` so that rows line up, then the target
    columns are converted to lists of ints.

    Returns:
        A `(y_true, y_pred)` tuple of equally long lists of ints, ordered by id.

    Raises:
        AssertionError: if `answers` lacks one of the required columns.
        InvalidSubmissionError: if `submission` has the wrong length, lacks a
            required column, has ids that differ from the answers, or contains
            a target value that cannot be converted to an int (e.g. NaN or
            free text).
    """

    # Answers checks
    assert id_col in answers.columns, f"Answers must have '{id_col}' column"
    assert target_col in answers.columns, f"Answers must have '{target_col}' column"

    # Submission checks
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length {len(submission)} != answers length {len(answers)}"
        )
    if id_col not in submission.columns:
        raise InvalidSubmissionError(f"Submission must have '{id_col}' column")
    if target_col not in submission.columns:
        raise InvalidSubmissionError(f"Submission must have '{target_col}' column")

    # Sort submission and answers by "Id"
    submission = submission.sort_values(id_col)
    answers = answers.sort_values(id_col)
    if (submission[id_col].values != answers[id_col].values).any():
        raise InvalidSubmissionError(f"Submission and answers have mismatched '{id_col}' columns")

    y_true = [int(y) for y in answers[target_col]]
    # A malformed prediction is the submitter's fault, so report it as an
    # invalid submission instead of letting the conversion error escape.
    try:
        y_pred = [int(y) for y in submission[target_col]]
    except (TypeError, ValueError, OverflowError) as err:
        raise InvalidSubmissionError(
            f"Submission '{target_col}' values must be convertible to integers"
        ) from err
    return y_true, y_pred
40
+
41
+
42
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Return the macro-averaged F1 score of a submission.

    Alignment and validation of the two frames is delegated to
    `prepare_for_metric`; whatever it raises propagates to the caller.
    """
    labels, predictions = prepare_for_metric(submission, answers)
    macro_f1 = f1_score(y_true=labels, y_pred=predictions, average="macro")
    return macro_f1
@@ -0,0 +1,118 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm import tqdm
6
+
7
+ from mlebench.utils import extract, get_logger, read_csv
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    The split is made per `location`: every row of a given location ends up
    entirely in train or entirely in test, so the two sets never share a
    location. Outputs:

    - `private/test.csv`: held-out rows including `category_id`
    - `private/answers.csv`: held-out `Id` / `Category` pairs
    - `public/train.csv`, `public/test.csv` (test without `category_id`)
    - `public/sample_submission.csv`: `Id` / `Category` with every category 0
    - `public/train_images.zip`, `public/test_images.zip`
    """

    # When True, only 100 randomly sampled images per set are copied (debug aid)
    dev_mode = False

    # Create train, test from train split
    # Original split ratio is 153730/(196157+153730) = 0.44
    # We use 0.1 so as to not take out too many samples from train
    test_size = 0.1
    old_train = read_csv(raw / "train.csv")

    # `test_size` is applied to the list of unique locations, not to rows, so
    # the resulting row-level test fraction depends on which locations are
    # drawn. Seed 8 was selected empirically by the original authors to land
    # on their intended fraction; the achieved value is logged below.
    locations = old_train["location"].unique()
    train_locations, test_locations = train_test_split(
        locations, test_size=test_size, random_state=8
    )

    # Vectorised membership test; avoids scanning the location array once per
    # row and avoids adding a temporary helper column to `old_train`.
    is_test = old_train["location"].isin(test_locations)
    new_train = old_train[~is_test]
    answers = old_train[is_test]

    logger.debug("Train locations: %s", train_locations)
    logger.debug("Test locations: %s", test_locations)
    logger.debug("Test size: %s", len(answers) / (len(new_train) + len(answers)))

    new_test = answers.copy().drop(columns=["category_id"])
    gold_submission = answers.copy()[["id", "category_id"]]
    gold_submission.rename(columns={"id": "Id", "category_id": "Category"}, inplace=True)

    # Extract only what we need
    (raw / "train_images").mkdir(exist_ok=True)
    extract(raw / "train_images.zip", raw / "train_images")
    assert len(list(raw.glob("train_images/*.jpg"))) == len(
        old_train["id"].unique()
    ), f"Raw train images should have the same number of images as the unique ids in the old train set, but got {len(list(raw.glob('train_images/*.jpg')))} files and {len(old_train['id'].unique())} ids"

    # Make sample submission
    submission_df = new_test.copy()[["id"]]
    submission_df["category_id"] = 0
    submission_df.rename(columns={"id": "Id", "category_id": "Category"}, inplace=True)

    # Checks
    assert set(new_train["id"]).isdisjoint(
        set(new_test["id"])
    ), "new_train and new_test are not disjoint"
    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Length of new_train and new_test should be equal to the length of the original train set"
    assert len(answers) == len(
        new_test
    ), "Length of answers should be equal to the length of new_test"
    assert len(submission_df) == len(
        answers
    ), "Length of answers should be equal to the length of the sample submission"
    assert (
        old_train.columns.tolist() == new_train.columns.tolist()
    ), f"new_train should have the same columns as the original train set: old_train: {old_train.columns.tolist()} != new_train: {new_train.columns.tolist()}"
    assert set(new_train["location"]).isdisjoint(
        set(new_test["location"])
    ), "new_train and new_test should not share any locations"

    # Write CSVs
    answers.to_csv(private / "test.csv", index=False)
    gold_submission.to_csv(private / "answers.csv", index=False)
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(public / "test.csv", index=False)
    # index=False like every other CSV: the frame's index is just the row
    # position in the original train file and must not become a column of the
    # sample submission.
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Copy files
    (public / "train_images").mkdir(exist_ok=True)
    (public / "test_images").mkdir(exist_ok=True)

    if dev_mode:
        new_train = new_train.sample(n=100)
        new_test = new_test.sample(n=100)

    for file_id in tqdm(new_train["id"], desc="Copying train images"):
        shutil.copyfile(
            src=raw / "train_images" / f"{file_id}.jpg",
            dst=public / "train_images" / f"{file_id}.jpg",
        )

    # Test images also come from raw/train_images: the test set is carved out of it
    for file_id in tqdm(new_test["id"], desc="Copying test images"):
        shutil.copyfile(
            src=raw / "train_images" / f"{file_id}.jpg",
            dst=public / "test_images" / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public.glob("test_images/*.jpg"))) == len(
        new_test["id"].unique()
    ), f"Public test images should have the same number of images as the unique ids in the test set, but got {len(list(public.glob('test_images/*.jpg')))} files and {len(new_test['id'].unique())} ids"
    assert len(list(public.glob("train_images/*.jpg"))) == len(
        new_train["id"].unique()
    ), f"Public train images should have the same number of images as the unique ids in the train set, but got {len(list(public.glob('train_images/*.jpg')))} files and {len(new_train['id'].unique())} ids"

    # Zip up image directories and delete non-zipped files
    shutil.make_archive(str(public / "train_images"), "zip", public / "train_images")
    shutil.make_archive(str(public / "test_images"), "zip", public / "test_images")
    shutil.rmtree(public / "train_images")
    shutil.rmtree(public / "test_images")