dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py
@@ -0,0 +1,219 @@
+ import random
+ import shutil
+ from pathlib import Path
+
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     rng = random.Random(0)
+
+     # there are two subsets of training data:
+
+     # 1. one of instances that have bounding boxes
+     # 2. one of instances that have segmentations
+
+     # we need to preserve the ratios of the sizes of these subsets to the total train samples
+
+     # additionally, there is an overlap between the two subsets
+     # we need to preserve this overlap
+
+     DEV = False
+     old_train = read_csv(raw / "train.csv")
+
+     num_old_train = len(old_train)
+     if DEV:
+         DEV_RATIO = 0.175
+         num_old_train = round(DEV_RATIO * num_old_train)
+
+     # 2019 train folders (StudyInstanceUIDs), 1500 test folders, 2019 / (1500 + 2019) ~ 0.60 original train ratio
+     # each folder has ~ 300 images
+     # We use 0.1 ratio to avoid taking too many samples out of train
+     TRAIN_RATIO = 0.1
+     num_train_samples = round(num_old_train * TRAIN_RATIO)
+
+     # bboxes
+     old_train_bboxes = read_csv(raw / "train_bounding_boxes.csv")
+     if DEV:
+         old_train_bboxes = old_train_bboxes.sample(frac=DEV_RATIO, random_state=0)
+
+     old_train_bbox_ids = sorted(old_train_bboxes["StudyInstanceUID"].unique())
+     old_num_train_bbox_ids = len(old_train_bbox_ids)  # 235
+     new_num_train_bbox_ids = round(old_num_train_bbox_ids * TRAIN_RATIO)
+
+     # segmentations
+     old_train_segmentation_path = raw / "segmentations"
+     old_train_segmentation_ids = sorted([f.stem for f in old_train_segmentation_path.glob("*.nii")])
+     if DEV:
+         old_train_segmentation_ids = rng.sample(
+             old_train_segmentation_ids, round(DEV_RATIO * len(old_train_segmentation_ids))
+         )
+     old_num_train_segmentation_ids = len(old_train_segmentation_ids)  # 87
+     new_num_train_segmentation_ids = round(old_num_train_segmentation_ids * TRAIN_RATIO)
+
+     # overlap: list of StudyInstanceUIDs that have both bounding boxes and segmentations
+     old_overlap_ids = [uid for uid in old_train_bbox_ids if uid in old_train_segmentation_ids]
+     old_num_overlap = len(old_overlap_ids)  # 40
+     new_num_overlap = round(old_num_overlap * TRAIN_RATIO)
+
+     # start populating new train by picking the overlap instances
+     # sample new_num_overlap instances from the overlap randomly
+     new_overlap_ids = rng.sample(old_overlap_ids, new_num_overlap)
+     new_bboxes_ids = new_overlap_ids.copy()
+     new_segmentations_ids = new_overlap_ids.copy()
+     new_train_ids = new_overlap_ids.copy()
+
+     # add the `new_num_train_segmentation_ids - new_num_overlap` segmentation IDs that are not in the overlap
+     additional_segmentation_ids = rng.sample(
+         [uid for uid in old_train_segmentation_ids if uid not in old_overlap_ids],
+         new_num_train_segmentation_ids - new_num_overlap,
+     )
+     new_segmentations_ids += additional_segmentation_ids
+     new_train_ids += additional_segmentation_ids
+
+     # add the `new_num_train_bbox_ids - new_num_overlap` bounding-box IDs that are not in the overlap
+     additional_bbox_ids = rng.sample(
+         [uid for uid in old_train_bbox_ids if uid not in old_overlap_ids],
+         new_num_train_bbox_ids - new_num_overlap,
+     )
+     new_bboxes_ids += additional_bbox_ids
+     new_train_ids += additional_bbox_ids
+
+     if DEV:
+         # old train has whatever is currently in new_train_ids
+         # + a random sample of the rest, s.t. it's DEV_RATIO of the original train
+         dev_old_train_ids = new_train_ids + rng.sample(
+             [uid for uid in old_train["StudyInstanceUID"] if uid not in new_train_ids],
+             num_old_train - len(new_train_ids),
+         )
+         old_train = old_train[old_train["StudyInstanceUID"].isin(dev_old_train_ids)].copy()
+
+     # then, fill the rest of the new train.
+     new_train_ids += rng.sample(
+         [uid for uid in old_train["StudyInstanceUID"] if uid not in new_train_ids],
+         num_train_samples - len(new_train_ids),
+     )
+
+     train = old_train[old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
+     train.to_csv(public / "train.csv", index=False)
+
+     train_bboxes = old_train_bboxes[
+         old_train_bboxes["StudyInstanceUID"].isin(new_bboxes_ids)
+     ].copy()
+     train_bboxes.to_csv(public / "train_bounding_boxes.csv", index=False)
+
+     answers = old_train[~old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
+     # columns become rows for the test and sample submission, so also for answers
+     answers = answers.melt(
+         id_vars="StudyInstanceUID", var_name="prediction_type", value_name="fractured"
+     )
+     answers["row_id"] = answers["StudyInstanceUID"] + "_" + answers["prediction_type"]
+     answers.to_csv(private / "answers.csv", index=False)
+
+     sample_submission = answers[["row_id", "fractured"]].copy()
+     sample_submission["fractured"] = 0.5
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     public_test = answers.drop(columns=["fractured"]).copy()
+     public_test.to_csv(public / "test.csv", index=False)
+
+     # assert that the melting worked
+     assert answers["StudyInstanceUID"].nunique() * 8 == len(
+         answers
+     ), "Melting failed, incorrect length"
+     assert answers.columns.tolist() == [
+         "StudyInstanceUID",
+         "prediction_type",
+         "fractured",
+         "row_id",
+     ], "Melting went wrong, columns are wrong"
+
+     # column checks
+     train_cols = ["StudyInstanceUID", "patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
+     assert train.columns.tolist() == train_cols, "Train columns are wrong"
+     bbox_cols = ["StudyInstanceUID", "x", "y", "width", "height", "slice_number"]
+     assert train_bboxes.columns.tolist() == bbox_cols, "Bounding box columns are wrong"
+     test_cols = ["StudyInstanceUID", "prediction_type", "row_id"]
+     assert public_test.columns.tolist() == test_cols, "Test columns are wrong"
+     submission_cols = ["row_id", "fractured"]
+     assert sample_submission.columns.tolist() == submission_cols, "Submission columns are wrong"
+
+     # Check that the correct number of training samples is selected
+     assert len(new_train_ids) == round(len(old_train) * TRAIN_RATIO), (
+         "Incorrect number of training samples."
+         " The number of `new_train_ids` doesn't match the expected number given the `TRAIN_RATIO`."
+     )
+     assert len(train) + answers["StudyInstanceUID"].nunique() == len(old_train), (
+         "Incorrect number of training samples."
+         " New train and test splits don't sum to the length of the original train set."
+     )
+
+     # Check that the correct number of bounding box samples is selected
+     assert len(new_bboxes_ids) == round(
+         len(old_train_bbox_ids) * TRAIN_RATIO
+     ), "Incorrect number of bounding box samples"
+
+     # Check that the correct number of segmentation samples is selected
+     assert len(new_segmentations_ids) == round(
+         len(old_train_segmentation_ids) * TRAIN_RATIO
+     ), "Incorrect number of segmentation samples"
+
+     # Check that the overlap is preserved
+     assert len(new_overlap_ids) == round(
+         len(old_overlap_ids) * TRAIN_RATIO
+     ), "Incorrect overlap preservation"
+
+     # check that test and train don't share study instance ids
+     assert set(train["StudyInstanceUID"]).isdisjoint(
+         set(public_test["StudyInstanceUID"].unique())
+     ), "Train and test share study instance ids"
+
+     # Now that splitting is done, copy over images accordingly
+     (public / "segmentations").mkdir(exist_ok=True)
+
+     for file_id in tqdm(
+         new_segmentations_ids, desc="Copying segmentations", total=len(new_segmentations_ids)
+     ):
+         shutil.copyfile(
+             src=old_train_segmentation_path / f"{file_id}.nii",
+             dst=public / "segmentations" / f"{file_id}.nii",
+         )
+
+     (public / "train_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         train["StudyInstanceUID"],
+         desc="Copying train images",
+         total=len(train),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw / "train_images" / study_id,
+             dst=public / "train_images" / study_id,
+             dirs_exist_ok=True,
+         )
+     (public / "test_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         public_test["StudyInstanceUID"].unique(),
+         desc="Copying test images",
+         total=public_test["StudyInstanceUID"].nunique(),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw / "train_images" / study_id,
+             dst=public / "test_images" / study_id,
+             dirs_exist_ok=True,
+         )
+
+     assert len(list((public / "segmentations").glob("*.nii"))) == len(
+         new_segmentations_ids
+     ), "Incorrect number of segmentations copied"
+     # check that all the right image directories are copied
+     assert len(list((public / "train_images").glob("*"))) == len(
+         train
+     ), "Incorrect number of train images copied"
+
+     assert (
+         len(list((public / "test_images").glob("*"))) == public_test["StudyInstanceUID"].nunique()
+     ), "Incorrect number of test images copied"
mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py
@@ -0,0 +1,257 @@
+ import random
+ import shutil
+ from pathlib import Path
+ from typing import List, Tuple
+
+ import pandas as pd
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def _split_and_save(
+     source_train_df: pd.DataFrame,
+     source_bbox_df: pd.DataFrame,
+     source_seg_ids: List[str],
+     raw_images_path: Path,
+     raw_segmentations_path: Path,
+     train_ratio: float,
+     rng: random.Random,
+     public_dir: Path,
+     private_dir: Path,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
+     """
+     Performs a data split based on provided source data and saves the results.
+
+     This function replicates the original script's logic for splitting data based on
+     ratios of bounding boxes, segmentations, and their overlap. It then saves the
+     resulting train/test sets, metadata, and copies image files to the specified
+     public and private directories.
+
+     Args:
+         source_train_df: DataFrame with the main training metadata to be split.
+         source_bbox_df: DataFrame with bounding box data to be split.
+         source_seg_ids: List of StudyInstanceUIDs that have segmentations.
+         raw_images_path: Path to the original directory of all study images.
+         raw_segmentations_path: Path to the original directory of all segmentations.
+         train_ratio: The ratio of the source data to be used for the new training set.
+         rng: A random number generator instance for deterministic sampling.
+         public_dir: The target public directory for outputs.
+         private_dir: The target private directory for outputs.
+
+     Returns:
+         A tuple containing the data for the *training* portion of the split:
+         (new_train_df, new_train_bboxes_df, new_segmentation_ids)
+     """
+     public_dir.mkdir(exist_ok=True, parents=True)
+     private_dir.mkdir(exist_ok=True, parents=True)
+
+     num_source_train = len(source_train_df)
+     num_train_samples = round(num_source_train * train_ratio)
+
+     # bboxes
+     source_train_bbox_ids = sorted(source_bbox_df["StudyInstanceUID"].unique())
+     source_num_train_bbox_ids = len(source_train_bbox_ids)
+     new_num_train_bbox_ids = round(source_num_train_bbox_ids * train_ratio)
+
+     # segmentations
+     source_num_train_segmentation_ids = len(source_seg_ids)
+     new_num_train_segmentation_ids = round(source_num_train_segmentation_ids * train_ratio)
+
+     # overlap: list of StudyInstanceUIDs that have both bounding boxes and segmentations
+     source_overlap_ids = [uid for uid in source_train_bbox_ids if uid in source_seg_ids]
+     source_num_overlap = len(source_overlap_ids)
+     new_num_overlap = round(source_num_overlap * train_ratio)
+
+     # start populating new train by picking the overlap instances
+     # sample new_num_overlap instances from the overlap randomly
+     new_overlap_ids = rng.sample(source_overlap_ids, new_num_overlap)
+     new_bboxes_ids = new_overlap_ids.copy()
+     new_segmentations_ids = new_overlap_ids.copy()
+     new_train_ids = new_overlap_ids.copy()
+
+     # add the `new_num_train_segmentation_ids - new_num_overlap` segmentation IDs that are not in the overlap
+     additional_segmentation_ids = rng.sample(
+         [uid for uid in source_seg_ids if uid not in source_overlap_ids],
+         new_num_train_segmentation_ids - new_num_overlap,
+     )
+     new_segmentations_ids += additional_segmentation_ids
+     new_train_ids += additional_segmentation_ids
+
+     # add the `new_num_train_bbox_ids - new_num_overlap` bounding-box IDs that are not in the overlap
+     additional_bbox_ids = rng.sample(
+         [uid for uid in source_train_bbox_ids if uid not in source_overlap_ids],
+         new_num_train_bbox_ids - new_num_overlap,
+     )
+     new_bboxes_ids += additional_bbox_ids
+     new_train_ids += additional_bbox_ids
+
+     # then, fill the rest of the new train.
+     num_to_sample = num_train_samples - len(new_train_ids)
+     available_pool = [uid for uid in source_train_df["StudyInstanceUID"] if uid not in new_train_ids]
+     new_train_ids += rng.sample(
+         available_pool,
+         min(num_to_sample, len(available_pool)),  # Avoid sampling more than available
+     )
+
+     train = source_train_df[source_train_df["StudyInstanceUID"].isin(new_train_ids)].copy()
+     train.to_csv(public_dir / "train.csv", index=False)
+
+     train_bboxes = source_bbox_df[
+         source_bbox_df["StudyInstanceUID"].isin(new_bboxes_ids)
+     ].copy()
+     train_bboxes.to_csv(public_dir / "train_bounding_boxes.csv", index=False)
+
+     answers = source_train_df[~source_train_df["StudyInstanceUID"].isin(new_train_ids)].copy()
+     # columns become rows for the test and sample submission, so also for answers
+     answers = answers.melt(
+         id_vars="StudyInstanceUID", var_name="prediction_type", value_name="fractured"
+     )
+     answers["row_id"] = answers["StudyInstanceUID"] + "_" + answers["prediction_type"]
+     answers.to_csv(private_dir / "answers.csv", index=False)
+
+     sample_submission = answers[["row_id", "fractured"]].copy()
+     sample_submission["fractured"] = 0.5
+     sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+     public_test = answers.drop(columns=["fractured"]).copy()
+     public_test.to_csv(public_dir / "test.csv", index=False)
+
+     # assert that the melting worked
+     if answers["StudyInstanceUID"].nunique() > 0:
+         assert answers["StudyInstanceUID"].nunique() * 8 == len(
+             answers
+         ), "Melting failed, incorrect length"
+     assert answers.columns.tolist() == [
+         "StudyInstanceUID",
+         "prediction_type",
+         "fractured",
+         "row_id",
+     ], "Melting went wrong, columns are wrong"
+
+     # column checks
+     train_cols = ["StudyInstanceUID", "patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
+     assert train.columns.tolist() == train_cols, "Train columns are wrong"
+     bbox_cols = ["StudyInstanceUID", "x", "y", "width", "height", "slice_number"]
+     assert train_bboxes.columns.tolist() == bbox_cols, "Bounding box columns are wrong"
+     test_cols = ["StudyInstanceUID", "prediction_type", "row_id"]
+     assert public_test.columns.tolist() == test_cols, "Test columns are wrong"
+     submission_cols = ["row_id", "fractured"]
+     assert sample_submission.columns.tolist() == submission_cols, "Submission columns are wrong"
+
+     # check that test and train don't share study instance ids
+     assert set(train["StudyInstanceUID"]).isdisjoint(
+         set(public_test["StudyInstanceUID"].unique())
+     ), "Train and test share study instance ids"
+
+     # Now that splitting is done, copy over images accordingly
+     (public_dir / "segmentations").mkdir(exist_ok=True)
+     for file_id in tqdm(
+         new_segmentations_ids,
+         desc=f"Copying segmentations to {public_dir.name}",
+         total=len(new_segmentations_ids),
+     ):
+         shutil.copyfile(
+             src=raw_segmentations_path / f"{file_id}.nii",
+             dst=public_dir / "segmentations" / f"{file_id}.nii",
+         )
+
+     (public_dir / "train_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         train["StudyInstanceUID"],
+         desc=f"Copying train images to {public_dir.name}",
+         total=len(train),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw_images_path / study_id,
+             dst=public_dir / "train_images" / study_id,
+             dirs_exist_ok=True,
+         )
+     (public_dir / "test_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         public_test["StudyInstanceUID"].unique(),
+         desc=f"Copying test images to {public_dir.name}",
+         total=public_test["StudyInstanceUID"].nunique(),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw_images_path / study_id,
+             dst=public_dir / "test_images" / study_id,
+             dirs_exist_ok=True,
+         )
+
+     return train, train_bboxes, new_segmentations_ids
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     rng = random.Random(0)
+
+     # there are two subsets of training data:
+
+     # 1. one of instances that have bounding boxes
+     # 2. one of instances that have segmentations
+
+     # we need to preserve the ratios of the sizes of these subsets to the total train samples
+
+     # additionally, there is an overlap between the two subsets
+     # we need to preserve this overlap
+
+     DEV = False
+     old_train = read_csv(raw / "train.csv")
+
+     num_old_train = len(old_train)
+     if DEV:
+         # This DEV logic is preserved from the original script to ensure
+         # identical behavior if ever enabled. It is currently inactive.
+         DEV_RATIO = 0.175
+         num_old_train = round(DEV_RATIO * num_old_train)
+         # The complex DEV logic from the original script is not fully ported
+         # as it was intertwined with the main logic and is disabled by default.
+         # This simplified version just subsamples the main dataframe.
+         old_train = old_train.sample(n=num_old_train, random_state=0)
+
+     # 2019 train folders (StudyInstanceUIDs), 1500 test folders, 2019 / (1500 + 2019) ~ 0.60 original train ratio
+     # each folder has ~ 300 images
+     # We use 0.1 ratio to avoid taking too many samples out of train
+     TRAIN_RATIO = 0.1
+
+     # Load all raw source data once
+     old_train_bboxes = read_csv(raw / "train_bounding_boxes.csv")
+     old_train_segmentation_path = raw / "segmentations"
+     old_train_segmentation_ids = sorted([f.stem for f in old_train_segmentation_path.glob("*.nii")])
+
+     # === Step 1: Perform the original data split to create `public` and `private` ===
+     # This call produces the main train/test split. The outputs in `public` and
+     # `private` will be identical to the original script's output.
+     # We capture the resulting training set data to be used as the source for our next split.
+     train_df, train_bboxes_df, train_seg_ids = _split_and_save(
+         source_train_df=old_train,
+         source_bbox_df=old_train_bboxes,
+         source_seg_ids=old_train_segmentation_ids,
+         raw_images_path=raw / "train_images",
+         raw_segmentations_path=raw / "segmentations",
+         train_ratio=TRAIN_RATIO,
+         rng=rng,
+         public_dir=public,
+         private_dir=private,
+     )
+
+     # === Step 2: Perform a second split on the new training set to create a validation set ===
+     # This call takes the *training set* from the first split (`train_df`) and
+     # splits it again using the exact same logic and ratio.
+     # The results are saved to the new `public_val` and `private_val` directories.
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     _split_and_save(
+         source_train_df=train_df,
+         source_bbox_df=train_bboxes_df,
+         source_seg_ids=train_seg_ids,
+         raw_images_path=raw / "train_images",  # Image source is still the main raw folder
+         raw_segmentations_path=raw / "segmentations",
+         train_ratio=TRAIN_RATIO,  # Use the same split ratio
+         rng=rng,
+         public_dir=public_val,
+         private_dir=private_val,
+     )
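One design detail worth noting in prepare_val.py: both _split_and_save calls share the single random.Random(0) instance created in prepare(), so the validation split consumes the generator state left behind by the first split. A tiny self-contained illustration (not from the package) of why that keeps both splits reproducible across runs:

import random

rng = random.Random(0)
first_split = rng.sample(range(100), 5)   # analogous to the public/private sampling
second_split = rng.sample(range(100), 5)  # analogous to the public_val/private_val sampling

# Re-seeding and replaying the same calls reproduces both splits exactly.
fresh = random.Random(0)
assert first_split == fresh.sample(range(100), 5)
assert second_split == fresh.sample(range(100), 5)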
mlebench/competitions/rsna-breast-cancer-detection/grade.py
@@ -0,0 +1,65 @@
+ import pandas as pd
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def pfbeta(labels, predictions, beta):
+     """
+     Adapted from https://www.kaggle.com/code/sohier/probabilistic-f-score
+     adaptations: code formatting, this docstring.
+     """
+     y_true_count = 0
+     ctp = 0
+     cfp = 0
+
+     for idx in range(len(labels)):
+         prediction = min(max(predictions[idx], 0), 1)
+         if labels[idx]:
+             y_true_count += 1
+             ctp += prediction
+         else:
+             cfp += prediction
+
+     beta_squared = beta * beta
+     c_precision = ctp / (ctp + cfp)
+     c_recall = ctp / y_true_count
+     if c_precision > 0 and c_recall > 0:
+         result = (
+             (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
+         )
+         return result
+     else:
+         return 0
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+     id_col = "prediction_id"
+     target_col = "cancer"
+
+     assert id_col in answers.columns, f"{id_col} not in answers"
+     assert target_col in answers.columns, f"{target_col} not in answers"
+
+     if id_col not in submission.columns:
+         raise InvalidSubmissionError(f"{id_col} not in submission")
+     if target_col not in submission.columns:
+         raise InvalidSubmissionError(f"{target_col} not in submission")
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError("submission and answers have different lengths")
+
+     submission = submission.sort_values(by=id_col)
+     answers = answers.sort_values(by=id_col)
+
+     if (submission[id_col].values != answers[id_col].values).any():
+         raise InvalidSubmissionError(f"{id_col} not aligned")
+
+     labels = answers[target_col].to_numpy()
+     predictions = submission[target_col].to_numpy()
+
+     return {"labels": labels, "predictions": predictions}
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     pf1_inputs = prepare_for_metric(submission, answers)
+     # F1 score, so beta=1; https://www.kaggle.com/competitions/rsna-breast-cancer-detection/discussion/370123
+     score = pfbeta(**pf1_inputs, beta=1)
+     return score
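pfbeta above is a probabilistic F-beta: it sums the clipped predicted probabilities into ctp/cfp instead of counting thresholded positives. A small worked example (not part of the package, assuming the pfbeta defined above is in scope) for beta=1:

labels = [1, 1, 0, 0]
predictions = [0.9, 0.4, 0.2, 0.1]

# ctp = 0.9 + 0.4 = 1.3, cfp = 0.2 + 0.1 = 0.3, y_true_count = 2
# precision = 1.3 / 1.6 = 0.8125, recall = 1.3 / 2 = 0.65
# pF1 = 2 * 0.8125 * 0.65 / (0.8125 + 0.65) ≈ 0.722
print(pfbeta(labels, predictions, beta=1))  # ≈ 0.722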
mlebench/competitions/rsna-breast-cancer-detection/prepare.py
@@ -0,0 +1,141 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+ # columns in train.csv
+ train_columns = [
+     "site_id",
+     "patient_id",
+     "image_id",
+     "laterality",
+     "view",
+     "age",
+     "cancer",
+     "biopsy",
+     "invasive",
+     "BIRADS",
+     "implant",
+     "density",
+     "machine_id",
+     "difficult_negative_case",
+ ]
+ # columns in test.csv
+ test_columns = [
+     "site_id",
+     "patient_id",
+     "image_id",
+     "laterality",
+     "view",
+     "age",
+     "implant",
+     "machine_id",
+     "prediction_id",
+ ]
+
+ # columns in answers/submission
+ submission_columns = ["prediction_id", "cancer"]
+
+
+ DEV = False
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     old_train = read_csv(raw / "train.csv")
+     # work on 5k samples for now, instead of 54k
+     if DEV:
+         old_train = old_train.sample(5000, random_state=42)
+
+     # "You can expect roughly 8,000 patients" in the test set
+     # so, split on patients. There are 11913 patients in train set
+     patient_ids = old_train["patient_id"].unique()
+     # Original ratio is 8000 / (8000 + 11913) ~ 0.4
+     # We use 0.1 to avoid taking too many samples out of train
+     train_patients, test_patients = train_test_split(patient_ids, test_size=0.1, random_state=42)
+     train_patients = set(train_patients)
+     test_patients = set(test_patients)
+
+     new_train = old_train[old_train["patient_id"].isin(train_patients)].copy()
+     # don't index the `prediction_id` (last col) since it's not in train and we need to build it
+     # index also the `cancer` column, which we'll drop later for the without_labels version
+     answers = old_train[old_train["patient_id"].isin(test_patients)][
+         test_columns[:-1] + ["cancer"]
+     ].copy()
+
+     new_train.to_csv(public / "train.csv", index=False)
+
+     answers["prediction_id"] = answers["patient_id"].astype(str) + "_" + answers["laterality"]
+     new_test_without_labels = answers.drop(columns=["cancer"])
+     new_test_without_labels.to_csv(public / "test.csv", index=False)
+
+     # merge multiple prediction_ids for the same patient into one for sample_submission and private test
+     answers = answers[submission_columns].copy()
+     # just take the first label for each prediction id -- the rest will be identical duplicates
+     answers = answers.groupby("prediction_id").first().reset_index()
+     answers.to_csv(private / "answers.csv", index=False)
+
+     sample_submission = answers.copy()
+     sample_submission["cancer"] = new_train.cancer.mean()  # mean cancer rate in train set
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     assert len(sample_submission) == len(
+         answers
+     ), "sample_submission and test.csv should have the same number of rows"
+     assert len(new_test_without_labels) + len(new_train) == len(
+         old_train
+     ), "The sum of the rows in new_test_without_labels and new_train should be equal to the number of rows in old_train"
+     # because of the merging
+     assert len(answers) != len(
+         new_test_without_labels
+     ), "new_test and new_test_without_labels should have different number of rows"
+
+     assert (
+         answers.columns.tolist() == submission_columns
+     ), f"answers should have columns {submission_columns}"
+     assert (
+         sample_submission.columns.tolist() == submission_columns
+     ), f"sample_submission should have columns {submission_columns}"
+
+     assert (
+         new_train.columns.tolist() == old_train.columns.tolist()
+     ), f"new_train should have columns {old_train.columns.tolist()}, got {new_train.columns.tolist()}"
+     assert (
+         new_test_without_labels.columns.tolist() == test_columns
+     ), f"new_test_without_labels should have columns {test_columns}, got {new_test_without_labels.columns.tolist()}"
+
+     assert set(new_test_without_labels["patient_id"]).isdisjoint(
+         set(new_train["patient_id"])
+     ), "new_test_without_labels and new_train should have disjoint patient_ids"
+
+     # finally, split the images
+     (public / "train_images").mkdir(exist_ok=True)
+     for patient_id in tqdm(train_patients, total=len(train_patients)):
+         patient_id_str = str(patient_id)
+         patient_dir = public / "train_images" / patient_id_str
+         patient_dir.mkdir(exist_ok=True)
+         image_ids = new_train[new_train["patient_id"] == patient_id]["image_id"].to_list()
+         for image_id in image_ids:
+             shutil.copy(raw / "train_images" / patient_id_str / f"{image_id}.dcm", patient_dir)
+
+     (public / "test_images").mkdir(exist_ok=True)
+     for patient_id in tqdm(test_patients, total=len(test_patients)):
+         patient_id_str = str(patient_id)
+         patient_dir = public / "test_images" / patient_id_str
+         patient_dir.mkdir(exist_ok=True)
+         image_ids = new_test_without_labels[new_test_without_labels["patient_id"] == patient_id][
+             "image_id"
+         ].to_list()
+         for image_id in image_ids:
+             shutil.copy(raw / "train_images" / patient_id_str / f"{image_id}.dcm", patient_dir)
+
+     # final checks
+     assert len(list((public / "train_images").rglob("*.dcm"))) == len(
+         new_train
+     ), "Number of images in train_images should be equal to the number of rows in new_train"
+     assert len(list((public / "test_images").rglob("*.dcm"))) == len(
+         new_test_without_labels
+     ), "Number of images in test_images should be equal to the number of rows in new_test_without_labels"