dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,135 @@
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+
+ def _split_and_save(
+     input_features: np.ndarray,
+     input_labels: np.ndarray,
+     test_proportion: float,
+     random_state: int,
+     output_public_path: Path,
+     output_private_path: Path,
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """
+     Helper function to perform a split, save files to specified paths, and return the new training set.
+
+     This function encapsulates the logic for:
+     1. Splitting input data into training and testing sets.
+     2. Creating the necessary directory structure.
+     3. Saving the training data, (unlabeled) test data, and sample submission to the public path.
+     4. Saving the ground-truth test labels to the private path.
+     5. Performing sanity checks on the created files.
+
+     Args:
+         input_features: The feature data to be split.
+         input_labels: The corresponding labels for the feature data.
+         test_proportion: The proportion of the dataset to allocate to the test split.
+         random_state: The seed used by the random number generator for reproducibility.
+         output_public_path: The base directory for public-facing files.
+         output_private_path: The base directory for private/solution files.
+
+     Returns:
+         A tuple containing the features and labels of the newly created training set,
+         which can be used for subsequent splits.
+     """
+     input_idxs = range(len(input_features))
+
+     # Create new splits
+     new_train_idxs, new_test_idxs = train_test_split(
+         input_idxs, test_size=test_proportion, random_state=random_state
+     )
+
+     new_train = input_features[new_train_idxs]
+     new_train_label = input_labels[new_train_idxs]
+     new_test = input_features[new_test_idxs]
+     new_test_label = input_labels[new_test_idxs]
+
+     answers_df = pd.DataFrame({"Id": range(len(new_test)), "ClassId": new_test_label})
+
+     # Create sample submission
+     sample_submission = answers_df.copy()
+     sample_submission["ClassId"] = 0
+
+     # Create directories
+     (output_public_path / "timit_11" / "timit_11").mkdir(parents=True, exist_ok=True)
+     output_private_path.mkdir(parents=True, exist_ok=True)
+
+
+     # Save files
+     np.save(output_public_path / "timit_11" / "timit_11" / "train_11.npy", new_train)
+     np.save(output_public_path / "timit_11" / "timit_11" / "train_label_11.npy", new_train_label)
+     np.save(output_public_path / "timit_11" / "timit_11" / "test_11.npy", new_test)
+     sample_submission.to_csv(output_public_path / "sampleSubmission.csv", index=False)
+     answers_df.to_csv(output_private_path / "answers.csv", index=False)
+
+     # Sanity checks
+     assert (
+         output_public_path / "timit_11" / "timit_11" / "train_11.npy"
+     ).exists(), f"`train_11.npy` doesn't exist in {output_public_path}!"
+     assert (
+         output_public_path / "timit_11" / "timit_11" / "train_label_11.npy"
+     ).exists(), f"`train_label_11.npy` doesn't exist in {output_public_path}!"
+     assert (
+         output_public_path / "timit_11" / "timit_11" / "test_11.npy"
+     ).exists(), f"`test_11.npy` doesn't exist in {output_public_path}!"
+     assert (
+         output_public_path / "sampleSubmission.csv"
+     ).exists(), f"`sampleSubmission.csv` doesn't exist in {output_public_path}!"
+     assert (
+         output_private_path / "answers.csv"
+     ).exists(), f"`answers.csv` doesn't exist in {output_private_path}!"
+
+     assert len(new_train) + len(new_test) == len(
+         input_features
+     ), f"Expected {len(input_features)} samples in combined new train and test splits, got {len(new_train) + len(new_test)}!"
+
+     # Return the new training set for potential further splitting
+     return new_train, new_train_label
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Create a primary train/test split and a secondary train/validation split.
+     The primary split results are saved to `public` and `private` directories.
+     The secondary split results are saved to parallel `public_val` and `private_val` directories.
+     """
+     # Load original data from raw directory
+     old_train = np.load(raw / "timit_11" / "timit_11" / "train_11.npy")
+     old_train_label = np.load(raw / "timit_11" / "timit_11" / "train_label_11.npy")
+
+     # Determine the test set proportion from the original competition data
+     old_test = np.load(raw / "timit_11" / "timit_11" / "test_11.npy")
+     old_test_prop = len(old_test) / (len(old_train) + len(old_test))  # ~= 0.268
+     RANDOM_STATE = 0
+
+     # --- First Split: Create Original public/private datasets ---
+     # This call generates the exact same output as the original script.
+     # It returns the features/labels of its training set, which we will use for the next split.
+     train_for_val_split, train_label_for_val_split = _split_and_save(
+         input_features=old_train,
+         input_labels=old_train_label,
+         test_proportion=old_test_prop,
+         random_state=RANDOM_STATE,
+         output_public_path=public,
+         output_private_path=private,
+     )
+
+     # --- Second Split: Create New public_val/private_val datasets ---
+     # Define the new parallel directories for the validation set
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Call the helper again, this time splitting the data from the *first* split's training set.
+     # This creates the new training set (train_val) and validation set (test_val).
+     # The splitting logic, proportion, and random state are identical to the first split.
+     _split_and_save(
+         input_features=train_for_val_split,
+         input_labels=train_label_for_val_split,
+         test_proportion=old_test_prop,
+         random_state=RANDOM_STATE,
+         output_public_path=public_val,
+         output_private_path=private_val,
+     )
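For orientation, a minimal usage sketch (not part of the diff) of invoking the prepare() defined above; it assumes a raw/ directory that already contains the extracted timit_11 .npy files, and the public/private directory names are hypothetical.

    from pathlib import Path

    # prepare() writes the primary split to public/ and private/, and the
    # secondary train/validation split to sibling public_val/ and private_val/.
    prepare(raw=Path("raw"), public=Path("public"), private=Path("private"))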
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.competitions.utils import prepare_for_auroc_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     roc_auc_inputs = prepare_for_auroc_metric(
+         submission=submission, answers=answers, id_col="Id", target_col="Probability"
+     )
+     return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
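A toy sketch of how this grader could be exercised (not part of the diff); it assumes prepare_for_auroc_metric, whose body is not shown here, aligns the two frames on "Id" and returns the "y_true"/"y_score" arrays consumed above.

    import pandas as pd

    # Hypothetical frames in the answers.csv / sample_submission.csv format,
    # one row per (rec_id, species_id) pair.
    answers = pd.DataFrame({"Id": [0, 1, 2, 3], "Probability": [1, 0, 0, 1]})
    submission = pd.DataFrame({"Id": [0, 1, 2, 3], "Probability": [0.9, 0.2, 0.4, 0.7]})
    print(grade(submission, answers))  # both positives outrank both negatives -> AUROC 1.0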
@@ -0,0 +1,182 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import Dict
+
+ import pandas as pd
+
+ from mlebench.utils import extract, read_csv
+
+
+ def filter_and_write_file(src: Path, dst: Path, old_id_to_new: Dict[int, int]):
+     """
+     Given a txt file whose column 0 is rec_id, filters out rec_ids that are not in old_id_to_new and writes the result to dst.
+     """
+     history_of_segments = open(src).read().splitlines()
+     history_of_segments = history_of_segments[1:]
+     history_of_segments = [
+         (int(i.split(",")[0]), ",".join(i.split(",")[1:])) for i in history_of_segments
+     ]
+     history_of_segments = [
+         (old_id_to_new[i[0]], i[1]) for i in history_of_segments if i[0] in old_id_to_new.keys()
+     ]
+     with open(dst, "w") as f:
+         f.write("rec_id,[histogram of segment features]\n")
+         for rec_id, labels in history_of_segments:
+             f.write(f"{rec_id},{labels}\n")
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+     # extract only what we need
+     extract(raw / "mlsp_contest_dataset.zip", raw)
+
+     (public / "essential_data").mkdir(exist_ok=True)
+     (public / "supplemental_data").mkdir(exist_ok=True)
+
+     # Create train, test from train split
+     cv_folds = read_csv(raw / "mlsp_contest_dataset/essential_data/CVfolds_2.txt")
+     cv_folds = cv_folds[cv_folds["fold"] == 0].reset_index(drop=True)
+     cv_folds.loc[cv_folds.sample(frac=0.2, random_state=0).index, "fold"] = 1
+
+     old_id_to_new = {old_id: new_id for new_id, old_id in enumerate(cv_folds["rec_id"].values)}
+     cv_folds["rec_id"] = cv_folds.index
+     cv_folds.to_csv(public / "essential_data/CVfolds_2.txt", index=False)
+
+     test_rec_ids = cv_folds[cv_folds["fold"] == 1]["rec_id"].values
+     assert len(test_rec_ids) == 64, f"Expected 64 test rec_ids, got {len(test_rec_ids)}"
+
+     # Update id2filename with new split
+     rec_id2filename = read_csv(raw / "mlsp_contest_dataset/essential_data/rec_id2filename.txt")
+     rec_id2filename = rec_id2filename[rec_id2filename["rec_id"].isin(old_id_to_new.keys())]
+     rec_id2filename["rec_id"] = rec_id2filename["rec_id"].map(old_id_to_new)
+     rec_id2filename.to_csv(public / "essential_data/rec_id2filename.txt", index=False)
+     assert len(rec_id2filename) == len(
+         cv_folds
+     ), f"Expected {len(cv_folds)} entries in rec_id2filename, got {len(rec_id2filename)}"
+
+     # Update labels with new split
+     rec_labels = (
+         open(raw / "mlsp_contest_dataset/essential_data/rec_labels_test_hidden.txt")
+         .read()
+         .splitlines()
+     )
+     rec_labels = rec_labels[1:]  # Ignore header line
+     rec_labels_split = []
+     for i in rec_labels:
+         rec_id = i.split(",")[0]
+         labels = ",".join(i.split(",")[1:]) if len(i.split(",")) > 1 else ""
+         rec_labels_split.append((int(rec_id), labels))
+     rec_labels_split = [i for i in rec_labels_split if i[0] in old_id_to_new.keys()]
+     rec_labels_split = [(old_id_to_new[i[0]], i[1]) for i in rec_labels_split]
+
+     # Public labels
+     with open(public / "essential_data/rec_labels_test_hidden.txt", "w") as f:
+         f.write("rec_id,[labels]\n")
+         for rec_id, labels in rec_labels_split:
+             if rec_id in test_rec_ids:
+                 labels = "?"
+             if labels == "":  # Write without comma
+                 f.write(f"{rec_id}{labels}\n")
+             else:
+                 f.write(f"{rec_id},{labels}\n")
+
+     # Private labels. Create csv, with each row containing the label for a (rec_id, species_id) pair
+     data = {"Id": [], "Probability": []}
+     for rec_id, labels in rec_labels_split:
+         if rec_id not in test_rec_ids:
+             continue
+         species_ids = [int(i) for i in labels.split(",") if i != ""]
+         for species_id in range(0, 19):
+             data["Id"].append(rec_id * 100 + species_id)
+             data["Probability"].append(int(species_id in species_ids))
+
+     pd.DataFrame(data).to_csv(private / "answers.csv", index=False)
+     assert (
+         len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+     ), f"Expected {len(test_rec_ids)*19} entries in answers.csv, got {len(pd.DataFrame(data))}"
+
+     # Create new sample submission, following new submission format
+     # http://www.kaggle.com/c/mlsp-2013-birds/forums/t/4961/new-submission-parser
+     data = {
+         "Id": [rec_id * 100 + species_id for rec_id in test_rec_ids for species_id in range(0, 19)],
+         "Probability": 0,
+     }
+     pd.DataFrame(data).to_csv(public / "sample_submission.csv", index=False)
+     assert (
+         len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+     ), f"Expected {len(test_rec_ids)*19} entries in sample_submission.csv, got {len(pd.DataFrame(data))}"
+
+     # Copy over species list
+     shutil.copyfile(
+         src=raw / "mlsp_contest_dataset/essential_data/species_list.txt",
+         dst=public / "essential_data/species_list.txt",
+     )
+
+     # Copy over all src waves from train+test set
+     (public / "essential_data/src_wavs").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw / "mlsp_contest_dataset/essential_data/src_wavs" / f"{filename}.wav",
+             dst=public / "essential_data/src_wavs" / f"{filename}.wav",
+         )
+
+     # Copy over train+test filtered spectrograms, segmentation examples, spectrograms, and supervised segmentation
+     (public / "supplemental_data/filtered_spectrograms").mkdir(exist_ok=True)
+     (public / "supplemental_data/segmentation_examples").mkdir(exist_ok=True)
+     (public / "supplemental_data/spectrograms").mkdir(exist_ok=True)
+     (public / "supplemental_data/supervised_segmentation").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw
+             / "mlsp_contest_dataset/supplemental_data/filtered_spectrograms"
+             / f"{filename}.bmp",
+             dst=public / "supplemental_data/filtered_spectrograms" / f"{filename}.bmp",
+         )
+         if os.path.exists(
+             raw / "mlsp_contest_dataset/supplemental_data/segmentation_examples" / f"{filename}.bmp"
+         ):
+             shutil.copyfile(
+                 src=raw
+                 / "mlsp_contest_dataset/supplemental_data/segmentation_examples"
+                 / f"{filename}.bmp",
+                 dst=public / "supplemental_data/segmentation_examples" / f"{filename}.bmp",
+             )
+         shutil.copyfile(
+             src=raw / "mlsp_contest_dataset/supplemental_data/spectrograms" / f"{filename}.bmp",
+             dst=public / "supplemental_data/spectrograms" / f"{filename}.bmp",
+         )
+         shutil.copyfile(
+             src=raw
+             / "mlsp_contest_dataset/supplemental_data/supervised_segmentation"
+             / f"{filename}.bmp",
+             dst=public / "supplemental_data/supervised_segmentation" / f"{filename}.bmp",
+         )
+
+     # Copy over remaining files
+     shutil.copyfile(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_clusters.bmp",
+         dst=public / "supplemental_data/segment_clusters.bmp",
+     )
+     shutil.copyfile(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_mosaic.bmp",
+         dst=public / "supplemental_data/segment_mosaic.bmp",
+     )
+
+     filter_and_write_file(
+         src=raw / "mlsp_contest_dataset/supplemental_data/histogram_of_segments.txt",
+         dst=public / "supplemental_data/histogram_of_segments.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_features.txt",
+         dst=public / "supplemental_data/segment_features.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_rectangles.txt",
+         dst=public / "supplemental_data/segment_rectangles.txt",
+         old_id_to_new=old_id_to_new,
+     )
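One detail worth noting from the answers/sample-submission construction above: each row's Id packs both identifiers as Id = rec_id * 100 + species_id, with species_id in range(19), so the pair can be recovered with divmod. A tiny illustrative sketch (the Id value is hypothetical):

    rec_id, species_id = divmod(4203, 100)  # -> rec_id 42, species_id 3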
@@ -0,0 +1,241 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ import pandas as pd
+
+ from mlebench.utils import extract, read_csv
+
+
+ def filter_and_write_file(src: Path, dst: Path, old_id_to_new: Dict[int, int]):
+     """
+     Given a txt file whose column 0 is rec_id, filters out rec_ids that are not in old_id_to_new and writes the result to dst.
+     """
+     history_of_segments = open(src).read().splitlines()
+     history_of_segments = history_of_segments[1:]
+     history_of_segments = [
+         (int(i.split(",")[0]), ",".join(i.split(",")[1:])) for i in history_of_segments
+     ]
+     history_of_segments = [
+         (old_id_to_new[i[0]], i[1]) for i in history_of_segments if i[0] in old_id_to_new.keys()
+     ]
+     with open(dst, "w") as f:
+         f.write("rec_id,[histogram of segment features]\n")
+         for rec_id, labels in history_of_segments:
+             f.write(f"{rec_id},{labels}\n")
+
+
+ def _process_split(
+     cv_folds: pd.DataFrame,
+     full_rec_id2filename: pd.DataFrame,
+     full_rec_labels_split: List[Tuple[int, str]],
+     raw_path: Path,
+     public_path: Path,
+     private_path: Path,
+ ):
+     """
+     Helper function to process a single data split and generate corresponding public and private files.
+     """
+     public_path.mkdir(exist_ok=True, parents=True)
+     private_path.mkdir(exist_ok=True, parents=True)
+     (public_path / "essential_data").mkdir(exist_ok=True)
+     (public_path / "supplemental_data").mkdir(exist_ok=True)
+
+     # Create a new compact ID mapping for the current subset of data
+     old_ids_in_split = cv_folds["rec_id"].values
+     old_id_to_new = {old_id: new_id for new_id, old_id in enumerate(old_ids_in_split)}
+     cv_folds_processed = cv_folds.copy()
+     cv_folds_processed["rec_id"] = cv_folds_processed["rec_id"].map(old_id_to_new)
+     cv_folds_processed.to_csv(public_path / "essential_data/CVfolds_2.txt", index=False)
+
+     test_rec_ids = cv_folds_processed[cv_folds_processed["fold"] == 1]["rec_id"].values
+
+     # Update id2filename with new split
+     rec_id2filename = full_rec_id2filename[
+         full_rec_id2filename["rec_id"].isin(old_id_to_new.keys())
+     ].copy()
+     rec_id2filename["rec_id"] = rec_id2filename["rec_id"].map(old_id_to_new)
+     rec_id2filename.to_csv(public_path / "essential_data/rec_id2filename.txt", index=False)
+     assert len(rec_id2filename) == len(
+         cv_folds_processed
+     ), f"Expected {len(cv_folds_processed)} entries in rec_id2filename, got {len(rec_id2filename)}"
+
+     # Update labels with new split
+     rec_labels_split = [i for i in full_rec_labels_split if i[0] in old_id_to_new.keys()]
+     rec_labels_split = [(old_id_to_new[i[0]], i[1]) for i in rec_labels_split]
+
+     # Public labels
+     with open(public_path / "essential_data/rec_labels_test_hidden.txt", "w") as f:
+         f.write("rec_id,[labels]\n")
+         for rec_id, labels in rec_labels_split:
+             if rec_id in test_rec_ids:
+                 labels = "?"
+             if labels == "":  # Write without comma
+                 f.write(f"{rec_id}{labels}\n")
+             else:
+                 f.write(f"{rec_id},{labels}\n")
+
+     # Private labels. Create csv, with each row containing the label for a (rec_id, species_id) pair
+     data = {"Id": [], "Probability": []}
+     for rec_id, labels in rec_labels_split:
+         if rec_id not in test_rec_ids:
+             continue
+         species_ids = [int(i) for i in labels.split(",") if i != ""]
+         for species_id in range(0, 19):
+             data["Id"].append(rec_id * 100 + species_id)
+             data["Probability"].append(int(species_id in species_ids))
+
+     pd.DataFrame(data).to_csv(private_path / "answers.csv", index=False)
+     if len(test_rec_ids) > 0:
+         assert (
+             len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+         ), f"Expected {len(test_rec_ids)*19} entries in answers.csv, got {len(pd.DataFrame(data))}"
+
+     # Create new sample submission, following new submission format
+     data = {
+         "Id": [rec_id * 100 + species_id for rec_id in test_rec_ids for species_id in range(0, 19)],
+         "Probability": 0,
+     }
+     pd.DataFrame(data).to_csv(public_path / "sample_submission.csv", index=False)
+     if len(test_rec_ids) > 0:
+         assert (
+             len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+         ), f"Expected {len(test_rec_ids)*19} entries in sample_submission.csv, got {len(pd.DataFrame(data))}"
+
+     # Copy over species list
+     shutil.copyfile(
+         src=raw_path / "mlsp_contest_dataset/essential_data/species_list.txt",
+         dst=public_path / "essential_data/species_list.txt",
+     )
+
+     # Copy over all src waves from train+test set
+     (public_path / "essential_data/src_wavs").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw_path / "mlsp_contest_dataset/essential_data/src_wavs" / f"{filename}.wav",
+             dst=public_path / "essential_data/src_wavs" / f"{filename}.wav",
+         )
+
+     # Copy over train+test filtered spectrograms, segmentation examples, spectrograms, and supervised segmentation
+     (public_path / "supplemental_data/filtered_spectrograms").mkdir(exist_ok=True)
+     (public_path / "supplemental_data/segmentation_examples").mkdir(exist_ok=True)
+     (public_path / "supplemental_data/spectrograms").mkdir(exist_ok=True)
+     (public_path / "supplemental_data/supervised_segmentation").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw_path
+             / "mlsp_contest_dataset/supplemental_data/filtered_spectrograms"
+             / f"{filename}.bmp",
+             dst=public_path / "supplemental_data/filtered_spectrograms" / f"{filename}.bmp",
+         )
+         if os.path.exists(
+             raw_path / "mlsp_contest_dataset/supplemental_data/segmentation_examples" / f"{filename}.bmp"
+         ):
+             shutil.copyfile(
+                 src=raw_path
+                 / "mlsp_contest_dataset/supplemental_data/segmentation_examples"
+                 / f"{filename}.bmp",
+                 dst=public_path / "supplemental_data/segmentation_examples" / f"{filename}.bmp",
+             )
+         shutil.copyfile(
+             src=raw_path / "mlsp_contest_dataset/supplemental_data/spectrograms" / f"{filename}.bmp",
+             dst=public_path / "supplemental_data/spectrograms" / f"{filename}.bmp",
+         )
+         shutil.copyfile(
+             src=raw_path
+             / "mlsp_contest_dataset/supplemental_data/supervised_segmentation"
+             / f"{filename}.bmp",
+             dst=public_path / "supplemental_data/supervised_segmentation" / f"{filename}.bmp",
+         )
+
+     # Copy over remaining files
+     shutil.copyfile(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_clusters.bmp",
+         dst=public_path / "supplemental_data/segment_clusters.bmp",
+     )
+     shutil.copyfile(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_mosaic.bmp",
+         dst=public_path / "supplemental_data/segment_mosaic.bmp",
+     )
+
+     filter_and_write_file(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/histogram_of_segments.txt",
+         dst=public_path / "supplemental_data/histogram_of_segments.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_features.txt",
+         dst=public_path / "supplemental_data/segment_features.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_rectangles.txt",
+         dst=public_path / "supplemental_data/segment_rectangles.txt",
+         old_id_to_new=old_id_to_new,
+     )
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     Also creates a secondary validation split in public_val and private_val directories.
+     """
+     # extract only what we need
+     extract(raw / "mlsp_contest_dataset.zip", raw)
+
+     # --- Load all necessary data once ---
+     full_cv_folds = read_csv(raw / "mlsp_contest_dataset/essential_data/CVfolds_2.txt")
+     full_rec_id2filename = read_csv(raw / "mlsp_contest_dataset/essential_data/rec_id2filename.txt")
+     rec_labels_raw = (
+         open(raw / "mlsp_contest_dataset/essential_data/rec_labels_test_hidden.txt")
+         .read()
+         .splitlines()
+     )
+     rec_labels_raw = rec_labels_raw[1:]  # Ignore header line
+     full_rec_labels_split = []
+     for i in rec_labels_raw:
+         rec_id = i.split(",")[0]
+         labels = ",".join(i.split(",")[1:]) if len(i.split(",")) > 1 else ""
+         full_rec_labels_split.append((int(rec_id), labels))
+
+     # --- 1. Create the original Train/Test split ---
+     # Filter to only the data used in the original split logic and create test set
+     cv_folds = full_cv_folds[full_cv_folds["fold"] == 0].reset_index(drop=True)
+     test_indices = cv_folds.sample(frac=0.2, random_state=0).index
+     cv_folds.loc[test_indices, "fold"] = 1
+     assert len(test_indices) == 64, f"Expected 64 test rec_ids, got {len(test_indices)}"
+
+     # Process and write files for the original public/private split
+     _process_split(
+         cv_folds=cv_folds,
+         full_rec_id2filename=full_rec_id2filename,
+         full_rec_labels_split=full_rec_labels_split,
+         raw_path=raw,
+         public_path=public,
+         private_path=private,
+     )
+
+     # --- 2. Create the new Train/Validation split from the first split's TRAIN set ---
+     # Define new paths for the validation split
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Take the training data from the first split (fold=0)
+     train_from_first_split = cv_folds[cv_folds["fold"] == 0].reset_index(drop=True)
+
+     # Re-split this training data to create a new train_val and test_val set
+     # The new test (validation) set should be ~64 samples.
+     # Original train size = 256. 64/256 = 0.25
+     val_indices = train_from_first_split.sample(frac=0.25, random_state=0).index
+     train_from_first_split.loc[val_indices, "fold"] = 1  # Re-use 'fold=1' to mark test set
+
+     # Process and write files for the new validation split
+     _process_split(
+         cv_folds=train_from_first_split,
+         full_rec_id2filename=full_rec_id2filename,
+         full_rec_labels_split=full_rec_labels_split,
+         raw_path=raw,
+         public_path=public_val,
+         private_path=private_val,
+     )
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import accuracy_score
+
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     accuracy_inputs = prepare_for_accuracy_metric(
+         submission=submission, answers=answers, target_column="Sentiment", id_column="PhraseId"
+     )
+     return accuracy_score(**accuracy_inputs)
@@ -0,0 +1,58 @@
+ import shutil
+ from pathlib import Path
+
+ from pandas import read_csv
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import extract
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     # Extract
+     extract(raw / "train.tsv.zip", raw)
+
+     # Create train and test splits from train set
+     test_ratio = 0.3  # 66293/(156061+66293) = 0.3
+     old_train = read_csv(raw / "train.tsv", sep="\t")
+     new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)
+
+     # Create public test
+     new_test = answers.copy()
+     new_test = new_test.drop("Sentiment", axis="columns")
+
+     # Create sample submission
+     sample_submission = answers[["PhraseId", "Sentiment"]].copy()
+     sample_submission["Sentiment"] = 2
+
+     # Checks
+     assert new_train["PhraseId"].is_unique, "PhraseId in new_train should be unique"
+     assert new_test["PhraseId"].is_unique, "PhraseId in new_test should be unique"
+     assert set(new_train["PhraseId"]).isdisjoint(
+         set(new_test["PhraseId"])
+     ), "PhraseId in new_train and new_test should be disjoint"
+     assert (
+         new_train.shape[0] + new_test.shape[0] == old_train.shape[0]
+     ), "New train and new test should have the same number of rows as the old train set"
+     assert (
+         new_train.columns.tolist() == old_train.columns.tolist()
+     ), "New train and old train should have the same columns"
+     assert new_test.columns.tolist() == [
+         "PhraseId",
+         "SentenceId",
+         "Phrase",
+     ], "new_test should have columns ['PhraseId', 'SentenceId', 'Phrase']"
+
+     # Write CSVs
+     answers.to_csv(private / "answers.csv", index=False)
+     new_train.to_csv(public / "train.tsv", index=False, sep="\t")
+     new_test.to_csv(public / "test.tsv", index=False, sep="\t")
+     sample_submission.to_csv(public / "sampleSubmission.csv", index=False)
+
+     # Zip files
+     shutil.make_archive(str(public / "train.tsv"), "zip", public, "train.tsv")
+     shutil.make_archive(str(public / "test.tsv"), "zip", public, "test.tsv")
+
+     # Delete unzipped files
+     (public / "train.tsv").unlink()
+     (public / "test.tsv").unlink()