dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,719 @@
1
+ import json
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+ from mlebench.utils import get_logger
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
+ def prepare(raw: Path, public: Path, private: Path):
15
+ """
16
+ Splits the raw train data into new train/test splits.
17
+
18
+ There isn't detailed documentation on how the train/test split of the raw data was made, but according to
19
+ this post https://www.kaggle.com/competitions/3d-object-detection-for-autonomous-vehicles/discussion/133895:
20
+ - It consists of raw camera data, lidar data, and an HD semantic map.
21
+ - 180 scenes, 25s each
22
+ - 638,000 2D and 3D annotations over 18,000 objects
23
+ - The dataset had nine classes with a large class imbalance.
24
+ The original train/test split:
25
+ - Train set 40% (train.csv has 22680 rows)
26
+ - Test set: public 30%, private 30% (sample submission has 27468 rows)
27
+
28
+ Since there are 180 scenes and >>180 samples, the samples are not independent (samples within a scene are highly correlated),
29
+ so we can't randomly split samples at an individual level. Instead, we split by scenes.
30
+ In practice, scenes are identified by "log tokens" in the data, corresponding to a single log file (listed in log.json).
31
+ (We also verified that the log IDs in the raw train/test splits are disjoint, which supports this choice.)
32
+
33
+ ## What's in the dataset?
34
+ ```
35
+ $ ls raw/
36
+ sample_submission.csv test_data/ test_images/ test_lidar/ test_maps/
37
+ train.csv train_data/ train_images/ train_lidar/ train_maps/
38
+
39
+ $ ls raw/train_data/
40
+ attribute.json category.json instance.json map.json sample_annotation.json scene.json visibility.json
41
+ calibrated_sensor.json ego_pose.json log.json sample.json sample_data.json sensor.json
42
+ $ ls raw/train_images/ | wc -l
43
+ 158757
44
+ $ ls raw/train_lidar | wc -l
45
+ 30744
46
+
47
+ $ ls raw/test_data/
48
+ attribute.json calibrated_sensor.json category.json ego_pose.json log.json map.json sample.json sample_data.json scene.json sensor.json visibility.json
49
+ # test_data/ omits the instance.json and sample_annotation.json files
50
+ $ ls raw/test_images/ | wc -l
51
+ 192276
52
+ $ ls raw/test_lidar/ | wc -l
53
+ 27468
54
+ """
55
+ DEV_MODE = False
56
+ (public / "test_data").mkdir(parents=True, exist_ok=True)
57
+ (public / "train_data").mkdir(parents=True, exist_ok=True)
58
+ private.mkdir(parents=True, exist_ok=True)
59
+
60
+ """
61
+ log.json
62
+ """
63
+ # Start the split at the log level, and the rest follows
64
+ with open(raw / "train_data" / "log.json") as f:
65
+ logs = json.load(f)
66
+ log_ids = [log["token"] for log in logs]
67
+ assert len(log_ids) == len(set(log_ids)), "Log IDs must be unique"
68
+ logger.info(f"Found {len(log_ids)} logs")
69
+ # Split the logs into train/test
70
+ # previous ratio had 180 train logs and 218 test logs; we'll split the 180 train logs into 80% new train and 20% new test
71
+ # (trying not to reduce the availability of training data, but need a large-ish set since there are 9 object classes)
72
+ train_log_ids, test_logs_ids = train_test_split(log_ids, test_size=0.2, random_state=0)
73
+ logger.info(f"Train logs: {len(train_log_ids)}, Test logs: {len(test_logs_ids)}")
74
+ with open(public / "train_data" / "log.json", "w") as f:
75
+ json.dump([log for log in logs if log["token"] in train_log_ids], f)
76
+ with open(public / "test_data" / "log.json", "w") as f:
77
+ json.dump([log for log in logs if log["token"] in test_logs_ids], f)
78
+
79
+ """
80
+ sample.json
81
+ """
82
+ # Create train/test sample splits following the log split
83
+ with open(raw / "train_data" / "sample.json") as f:
84
+ samples = json.load(f)
85
+ logger.info(f"Found {len(samples)} samples")
86
+ train_samples = [sample for sample in samples if sample["scene_token"] in train_log_ids]
87
+ test_samples = [sample for sample in samples if sample["scene_token"] in test_logs_ids]
88
+ logger.info(f"New train samples: {len(train_samples)}, new test samples: {len(test_samples)}")
89
+ assert len(train_samples) + len(test_samples) == len(
90
+ samples
91
+ ), f"New train ({len(train_samples)}) and test ({len(test_samples)}) samples must cover all samples ({len(samples)})"
92
+ with open(public / "train_data" / "sample.json", "w") as f:
93
+ json.dump(train_samples, f)
94
+ with open(public / "test_data" / "sample.json", "w") as f:
95
+ json.dump(test_samples, f)
96
+
97
+ """
98
+ Make train.csv
99
+ """
100
+ # train.csv has columns `Id` and `PredictionString`, with `PredictionString` in the following format: `center_x center_y center_z width length height yaw class_name`
101
+ with open(raw / "train.csv") as f:
102
+ train_df = pd.read_csv(f)
103
+ logger.info(f"Found {len(train_df)} train rows")
104
+ new_train_df = train_df[train_df["Id"].isin([sample["token"] for sample in train_samples])]
105
+ new_test_df = train_df[train_df["Id"].isin([sample["token"] for sample in test_samples])]
106
+ logger.info(f"Train rows: {len(new_train_df)}, Test rows: {len(new_test_df)}")
107
+ assert len(new_train_df) + len(new_test_df) == len(
108
+ train_df
109
+ ), f"New train ({len(new_train_df)}) and test ({len(new_test_df)}) annotations must cover all annotations ({len(train_df)})"
110
+ assert len(new_train_df) == len(
111
+ train_samples
112
+ ), f"New train rows ({len(new_train_df)}) must match train samples ({len(train_samples)})"
113
+ assert len(new_test_df) == len(
114
+ test_samples
115
+ ), f"New test rows ({len(new_test_df)}) must match test samples ({len(test_samples)})"
116
+ new_train_df.to_csv(public / "train.csv", index=False)
117
+
118
+ """
119
+ Make private test.csv
120
+ """
121
+ # test.csv is basically new_test_df, but the "PredictionString" column needs to have a "confidence" value added
122
+ # so the format becomes: `confidence center_x center_y center_z width length height yaw class_name`
123
def add_confidence(pred_string):
    """Prefix every object in a ground-truth prediction string with confidence 1.0.

    The input uses 8 whitespace-separated tokens per object:
    `center_x center_y center_z width length height yaw class_name`.
    The output uses 9 tokens per object:
    `confidence center_x center_y center_z width length height yaw class_name`.

    Args:
        pred_string: Whitespace-separated tokens describing zero or more objects.

    Returns:
        The same objects, each preceded by the token "1.0", joined by single
        spaces. An empty (or whitespace-only) input yields an empty string.

    Raises:
        AssertionError: If the number of tokens is not a multiple of 8.
    """
    # split() with no separator drops leading/trailing/repeated whitespace and
    # returns [] for an empty string, so "no objects" is handled instead of
    # producing a spurious single empty token.
    pred_tokens = pred_string.split()
    assert (
        len(pred_tokens) % 8 == 0
    ), f"Expected 8 tokens per object, but got {len(pred_tokens)}"
    new_pred_tokens = []
    # Walk the flat token list one object (8 tokens) at a time.
    for i in range(0, len(pred_tokens), 8):
        new_pred_tokens.extend(["1.0"] + pred_tokens[i : i + 8])
    return " ".join(new_pred_tokens)
132
+
133
+ # Apply the function to the entire 'PredictionString' column
134
+ new_test_df["PredictionString"] = new_test_df["PredictionString"].apply(add_confidence)
135
+ new_test_df.to_csv(private / "test.csv", index=False)
136
+
137
+ """
138
+ Make sample_submission.csv
139
+ """
140
+ # sample submission is the same as test.csv but with empty prediction strings
141
+ sample_submission = new_test_df[["Id"]].copy()
142
+ sample_submission["PredictionString"] = ""
143
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)
144
+
145
+ """
146
+ Split sample_data.json
147
+ """
148
+ # sample_data.json is a list of all images and lidar files, and each entry has a `sample_token` field that identifies which sample it belongs to
149
+ with open(raw / "train_data" / "sample_data.json") as f:
150
+ sample_data = json.load(f)
151
+ logger.info(f"Found {len(sample_data)} train sample data")
152
+ new_train_sample_data, new_test_sample_data = [], []
153
+ for sample_datum in sample_data:
154
+ sample_token = sample_datum["sample_token"]
155
+ if sample_token in new_train_df["Id"].values:
156
+ new_train_sample_data.append(sample_datum)
157
+ elif sample_token in new_test_df["Id"].values:
158
+ new_test_sample_data.append(sample_datum)
159
+ else:
160
+ raise ValueError(
161
+ f"Sample data token {sample_token} doesn't belong to either new train or new test set"
162
+ )
163
+ logger.info(
164
+ f"New train sample data: {len(new_train_sample_data)}, new test sample data: {len(new_test_sample_data)}"
165
+ )
166
+ assert len(new_train_sample_data) + len(new_test_sample_data) == len(
167
+ sample_data
168
+ ), f"New train ({len(new_train_sample_data)}) and test ({len(new_test_sample_data)}) sample data must cover all sample data ({len(sample_data)})"
169
+ with open(public / "train_data" / "sample_data.json", "w") as f:
170
+ json.dump(new_train_sample_data, f)
171
+ with open(public / "test_data" / "sample_data.json", "w") as f:
172
+ json.dump(new_test_sample_data, f)
173
+
174
+ """
175
+ Copy over maps
176
+ """
177
+ # There is only one map which is identical in both raw train/test so no need to modify, just copy over
178
+ # $ diff raw/test_maps/map_raster_palo_alto.png raw/train_maps/map_raster_palo_alto.png # -> no output
179
+ (public / "test_maps").mkdir(parents=True, exist_ok=True)
180
+ (public / "train_maps").mkdir(parents=True, exist_ok=True)
181
+ shutil.copyfile(
182
+ src=raw / "train_maps" / "map_raster_palo_alto.png",
183
+ dst=public / "test_maps" / "map_raster_palo_alto.png",
184
+ )
185
+ shutil.copyfile(
186
+ src=raw / "train_maps" / "map_raster_palo_alto.png",
187
+ dst=public / "train_maps" / "map_raster_palo_alto.png",
188
+ )
189
+
190
+ """
191
+ Copy attribute.json
192
+ """
193
+ # attribute.json is a list of object states, there are 18 attributes in the train set including "object_action_walking", "object_action_parked", etc.
194
+ # The raw test set has an attribute.json file drawn from the same set of attributes, but only has 17 attributes (whichever attributes
195
+ # are present in the test set.) For simplicity, we'll just copy the full list of 18 attributes in both the new train and new test sets.
196
+ shutil.copyfile(
197
+ src=raw / "train_data" / "attribute.json", dst=public / "train_data" / "attribute.json"
198
+ )
199
+ shutil.copyfile(
200
+ src=raw / "train_data" / "attribute.json", dst=public / "test_data" / "attribute.json"
201
+ )
202
+
203
+ """
204
+ Split calibrated_sensor.json
205
+ """
206
+ # calibrated_sensor.json is a list of sensor calibration parameters corresponding to the setup of the sensor at the time each sample was taken.
207
+ # This file will be split following the sample_data split (each sample_datum has a `calibrated_sensor_token`)
208
+ with open(raw / "train_data" / "calibrated_sensor.json") as f:
209
+ calibrated_sensors = json.load(f)
210
+ calibration_by_calibrated_sensor_token = {cal["token"]: cal for cal in calibrated_sensors}
211
+ new_train_calibrated_sensors, new_test_calibrated_sensors = [], []
212
+ for sample_datum in new_train_sample_data:
213
+ calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
214
+ if calibrated_sensor_token in [cal["token"] for cal in new_train_calibrated_sensors]:
215
+ continue # Each calibrated sensor is used by multiple samples, we don't need to add it multiple times
216
+ new_train_calibrated_sensors.append(
217
+ calibration_by_calibrated_sensor_token[calibrated_sensor_token]
218
+ )
219
+ for sample_datum in new_test_sample_data:
220
+ calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
221
+ if calibrated_sensor_token in [cal["token"] for cal in new_test_calibrated_sensors]:
222
+ continue # Each calibrated sensor is used by multiple samples, we don't need to add it multiple times
223
+ new_test_calibrated_sensors.append(
224
+ calibration_by_calibrated_sensor_token[calibrated_sensor_token]
225
+ )
226
+ logger.info(
227
+ f"New train calibrated sensors: {len(new_train_calibrated_sensors)}, new test calibrated sensors: {len(new_test_calibrated_sensors)}"
228
+ )
229
+ assert len(
230
+ set([cal["token"] for cal in new_train_calibrated_sensors + new_test_calibrated_sensors])
231
+ ) == len(
232
+ calibrated_sensors
233
+ ), f"New train and test calibrated sensors must cover all calibrated sensors ({len(calibrated_sensors)})"
234
+ with open(public / "train_data" / "calibrated_sensor.json", "w") as f:
235
+ json.dump(new_train_calibrated_sensors, f)
236
+ with open(public / "test_data" / "calibrated_sensor.json", "w") as f:
237
+ json.dump(new_test_calibrated_sensors, f)
238
+
239
+ """
240
+ Copy category.json
241
+ """
242
+ # category.json is the list of 9 object classes, and is the same for train/test
243
+ shutil.copyfile(
244
+ src=raw / "train_data" / "category.json", dst=public / "train_data" / "category.json"
245
+ )
246
+ shutil.copyfile(
247
+ src=raw / "train_data" / "category.json", dst=public / "test_data" / "category.json"
248
+ )
249
+
250
+ """
251
+ Split ego_pose.json
252
+ """
253
+ # ego_pose.json is a list of vehicle poses, and will be split following the sample_data split
254
+ with open(raw / "train_data" / "ego_pose.json") as f:
255
+ ego_poses = json.load(f)
256
+ ego_pose_by_ego_pose_token = {ego["token"]: ego for ego in ego_poses}
257
+ new_train_ego_poses, new_test_ego_poses = [], []
258
+ for sample_datum in new_train_sample_data:
259
+ ego_pose_token = sample_datum["ego_pose_token"]
260
+ new_train_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
261
+ for sample_datum in new_test_sample_data:
262
+ ego_pose_token = sample_datum["ego_pose_token"]
263
+ new_test_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
264
+ logger.info(
265
+ f"New train ego poses: {len(new_train_ego_poses)}, new test ego poses: {len(new_test_ego_poses)}"
266
+ )
267
+ assert len(set([ego["token"] for ego in new_train_ego_poses + new_test_ego_poses])) == len(
268
+ ego_poses
269
+ ), f"New train and test ego poses must cover all ego poses ({len(ego_poses)})"
270
+ with open(public / "train_data" / "ego_pose.json", "w") as f:
271
+ json.dump(new_train_ego_poses, f)
272
+ with open(public / "test_data" / "ego_pose.json", "w") as f:
273
+ json.dump(new_test_ego_poses, f)
274
+
275
+ """
276
+ Create map.json
277
+ """
278
+ # map.json is the list of maps, and a list of logs that used those maps. But in the raw dataset, we only have one map,
279
+ # so this ends up being just a list of one map, which has a sublist of all the logs in the split.
280
+ # [{"log_tokens": [...], "category": "semantic_prior", "filename": "maps/map_raster_palo_alto.png", "token": "53992ee3023e5494b90c316c183be829"}]
281
+ with open(raw / "train_data" / "map.json") as f:
282
+ maps = json.load(f)
283
+ assert len(maps) == 1, "Expected only one map in the raw dataset"
284
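+ # Just replace the list of "log_tokens" with the new train and test log IDs. NOTE(review): maps.copy() is a shallow list copy, so new_train_maps[0] and new_test_maps[0] are the same dict and the test assignment below overwrites the train one before either file is written -- confirm, and use copy.deepcopy if unintended.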
285
+ new_train_maps = maps.copy()
286
+ new_train_maps[0]["log_tokens"] = train_log_ids
287
+ new_test_maps = maps.copy()
288
+ new_test_maps[0]["log_tokens"] = test_logs_ids
289
+ with open(public / "train_data" / "map.json", "w") as f:
290
+ json.dump(new_train_maps, f)
291
+ with open(public / "test_data" / "map.json", "w") as f:
292
+ json.dump(new_test_maps, f)
293
+
294
+ """
295
+ Split scene.json
296
+ """
297
+ # scene.json is a list of scenes corresponding exactly to each log file. The scenes describe the first and last
298
+ # samples in each scene, as well as how many samples are in each scene.
299
+ # We'll split this following the log split.
300
+ with open(raw / "train_data" / "scene.json") as f:
301
+ scenes = json.load(f)
302
+ logger.info(f"Found {len(scenes)} scenes")
303
+ new_train_scenes, new_test_scenes = [], []
304
+ for scene in scenes:
305
+ log_token = scene["log_token"]
306
+ if log_token in train_log_ids:
307
+ new_train_scenes.append(scene)
308
+ elif log_token in test_logs_ids:
309
+ new_test_scenes.append(scene)
310
+ else:
311
+ raise ValueError(
312
+ f"Scene log token {log_token} doesn't belong to either new train or new test set"
313
+ )
314
+ logger.info(
315
+ f"New train scenes: {len(new_train_scenes)}, new test scenes: {len(new_test_scenes)}"
316
+ )
317
+ assert len(new_train_scenes) + len(new_test_scenes) == len(
318
+ scenes
319
+ ), f"New train ({len(new_train_scenes)}) and test ({len(new_test_scenes)}) scenes must cover all scenes ({len(scenes)})"
320
+ with open(public / "train_data" / "scene.json", "w") as f:
321
+ json.dump(new_train_scenes, f)
322
+ with open(public / "test_data" / "scene.json", "w") as f:
323
+ json.dump(new_test_scenes, f)
324
+
325
+ """
326
+ Copy sensor.json
327
+ """
328
+ # sensor.json is a list of sensors used in the dataset (10 sensors in the raw train set).
329
+ # For simplicity, we'll just copy the full list of sensors in both the new train and new test sets.
330
+ shutil.copyfile(
331
+ src=raw / "train_data" / "sensor.json", dst=public / "train_data" / "sensor.json"
332
+ )
333
+ shutil.copyfile(
334
+ src=raw / "train_data" / "sensor.json", dst=public / "test_data" / "sensor.json"
335
+ )
336
+
337
+ """
338
+ Copy visibility.json
339
+ """
340
+ # visibility.json is a list of 4 visibility classes describing how visible an annotated object is in a given sample.
341
+ # Both train and test use the same visibility classes, so we'll just copy these to the new train and new test sets.
342
+ shutil.copyfile(
343
+ src=raw / "train_data" / "visibility.json", dst=public / "train_data" / "visibility.json"
344
+ )
345
+ shutil.copyfile(
346
+ src=raw / "train_data" / "visibility.json", dst=public / "test_data" / "visibility.json"
347
+ )
348
+
349
+ """
350
+ Split sample_annotation.json
351
+ """
352
+ # sample_annotation.json is the full list of object annotations (bounding boxes) from all samples,
353
+ # and will be split following the sample split.
354
+ with open(raw / "train_data" / "sample_annotation.json") as f:
355
+ sample_annotations = json.load(f)
356
+ logger.info(f"Found {len(sample_annotations)} train sample annotations")
357
+ new_train_sample_annotations, new_test_sample_annotations = [], []
358
+ for sample_annotation in sample_annotations:
359
+ sample_token = sample_annotation["sample_token"]
360
+ if sample_token in new_train_df["Id"].values:
361
+ new_train_sample_annotations.append(sample_annotation)
362
+ elif sample_token in new_test_df["Id"].values:
363
+ new_test_sample_annotations.append(sample_annotation)
364
+ else:
365
+ raise ValueError(
366
+ f"Sample annotation token {sample_token} doesn't belong to either new train or new test set"
367
+ )
368
+ logger.info(
369
+ f"New train sample annotations: {len(new_train_sample_annotations)}, new test sample annotations: {len(new_test_sample_annotations)}"
370
+ )
371
+ assert len(new_train_sample_annotations) + len(new_test_sample_annotations) == len(
372
+ sample_annotations
373
+ ), f"New train ({len(new_train_sample_annotations)}) and test ({len(new_test_sample_annotations)}) sample annotations must cover all sample annotations ({len(sample_annotations)})"
374
+ with open(public / "train_data" / "sample_annotation.json", "w") as f:
375
+ json.dump(new_train_sample_annotations, f)
376
+ # NOTE: don't export (public / "test_data" / "sample_annotation.json") since the test set doesn't provide sample annotations
377
+
378
+ """
379
+ Split instance.json
380
+ """
381
+ # instance.json is a list of object instances (e.g. the same pedestrian appearing in contiguous frames),
382
+ # and will be split following the sample_annotation.json split
383
+ with open(raw / "train_data" / "instance.json") as f:
384
+ instances = json.load(f)
385
+ logger.info(f"Found {len(instances)} train instances")
386
+ new_train_instance_ids = set([sa["instance_token"] for sa in new_train_sample_annotations])
387
+ new_test_instance_ids = set([sa["instance_token"] for sa in new_test_sample_annotations])
388
+ new_train_instances, new_test_instances = [], []
389
+ for instance in instances:
390
+ if instance["token"] in new_train_instance_ids:
391
+ new_train_instances.append(instance)
392
+ elif instance["token"] in new_test_instance_ids:
393
+ new_test_instances.append(instance)
394
+ else:
395
+ raise ValueError(
396
+ f"Instance {instance['token']} doesn't belong to either new train or new test set"
397
+ )
398
+ logger.info(
399
+ f"New train instances: {len(new_train_instances)}, new test instances: {len(new_test_instances)}"
400
+ )
401
+ assert len(new_train_instances) + len(new_test_instances) == len(
402
+ instances
403
+ ), f"New train ({len(new_train_instances)}) and test ({len(new_test_instances)}) instances must cover all instances ({len(instances)})"
404
+ with open(public / "train_data" / "instance.json", "w") as f:
405
+ json.dump(new_train_instances, f)
406
+ # NOTE: don't export (public / "test_data" / "instance.json") since the test set doesn't provide instance annotations
407
+
408
+ """
409
+ Copy over the heavy image and lidar data
410
+ """
411
+ (public / "test_images").mkdir(parents=True, exist_ok=True)
412
+ (public / "train_images").mkdir(parents=True, exist_ok=True)
413
+ (public / "test_lidar").mkdir(parents=True, exist_ok=True)
414
+ (public / "train_lidar").mkdir(parents=True, exist_ok=True)
415
+ if DEV_MODE:
416
+ sample_data = sample_data[:100] # Just copy a few samples for testing
417
+ num_train_images, num_test_images = 0, 0
418
+ num_train_lidar, num_test_lidar = 0, 0
419
+ for sample_datum in tqdm(sample_data, desc="Copying images and lidar data"):
420
+ filename = Path(
421
+ sample_datum["filename"]
422
+ ).name # `filename` looks like "images/host-a011_cam2_1233689008717605006.jpeg", but we don't use that parent directory
423
+
424
+ is_test = sample_datum["sample_token"] in new_test_df["Id"].values
425
+
426
+ if sample_datum["fileformat"] == "jpeg":
427
+ assert filename.endswith("jpeg"), f"Expected .jpeg, but got {filename}"
428
+ src_file = raw / "train_images" / filename
429
+ if not src_file.exists():
430
+ raise FileNotFoundError(f"{src_file} does not exist")
431
+ # Image
432
+ if is_test:
433
+ dst_file = public / "test_images" / filename
434
+ if dst_file.exists():
435
+ logger.warning(f"Copying file to {dst_file}, but file already exists!")
436
+ else:
437
+ shutil.copyfile(src=src_file, dst=dst_file)
438
+ num_test_images += 1
439
+ else:
440
+ dst_file = public / "train_images" / filename
441
+ if dst_file.exists():
442
+ logger.warning(f"Copying file to {dst_file}, but file already exists!")
443
+ else:
444
+ shutil.copyfile(src=src_file, dst=dst_file)
445
+ num_train_images += 1
446
+ elif sample_datum["fileformat"] == "bin":
447
+ assert filename.endswith("bin"), f"Expected .bin, but got {filename}"
448
+ src_file = raw / "train_lidar" / filename
449
+ if not src_file.exists():
450
+ raise FileNotFoundError(f"{src_file} does not exist")
451
+ # Lidar
452
+ if is_test:
453
+ dst_file = public / "test_lidar" / filename
454
+ if dst_file.exists():
455
+ logger.warning(f"Copying file to {dst_file}, but file already exists!")
456
+ else:
457
+ shutil.copyfile(src=src_file, dst=dst_file)
458
+ num_test_lidar += 1
459
+ else:
460
+ dst_file = public / "train_lidar" / filename
461
+ if dst_file.exists():
462
+ logger.warning(f"Copying file to {dst_file}, but file already exists!")
463
+ else:
464
+ shutil.copyfile(src=src_file, dst=dst_file)
465
+ num_train_lidar += 1
466
+ else:
467
+ raise ValueError(
468
+ f"Unexpected `fileformat` in sample data: {sample_datum['fileformat']}"
469
+ )
470
+ assert num_train_images + num_test_images + num_train_lidar + num_test_lidar == len(
471
+ set(sample_datum["filename"] for sample_datum in sample_data)
472
+ ), f"Expected image and lidar samples for new train/test to cover all samples ({len(sample_data)})"
473
+ assert num_train_images == len(
474
+ list((public / "train_images").glob("*.jpeg"))
475
+ ), f"Expected {num_train_images} train images, but got {len(list((public / 'train_images').glob('*.jpeg')))}"
476
+ assert num_test_images == len(
477
+ list((public / "test_images").glob("*.jpeg"))
478
+ ), f"Expected {num_test_images} test images, but got {len(list((public / 'test_images').glob('*.jpeg')))}"
479
+ assert num_train_lidar == len(
480
+ list((public / "train_lidar").glob("*.bin"))
481
+ ), f"Expected {num_train_lidar} train lidar files, but got {len(list((public / 'train_lidar').glob('*.bin')))}"
482
+ assert num_test_lidar == len(
483
+ list((public / "test_lidar").glob("*.bin"))
484
+ ), f"Expected {num_test_lidar} test lidar files, but got {len(list((public / 'test_lidar').glob('*.bin')))}"
485
+
486
+ """
487
+ ========================================
488
+ VALIDATION SPLIT - Second split on train
489
+ ========================================
490
+ """
491
+ # Create paths for validation directories
492
+ public_val = public.parent / "public_val"
493
+ private_val = private.parent / "private_val"
494
+
495
+ (public_val / "test_data").mkdir(parents=True, exist_ok=True)
496
+ (public_val / "train_data").mkdir(parents=True, exist_ok=True)
497
+ private_val.mkdir(parents=True, exist_ok=True)
498
+
499
+ # Second split on train_log_ids
500
+ train_val_log_ids, test_val_log_ids = train_test_split(train_log_ids, test_size=0.2, random_state=0)
501
+ logger.info(f"Validation split - Train logs: {len(train_val_log_ids)}, Test logs: {len(test_val_log_ids)}")
502
+
503
+ # Write validation log.json files
504
+ with open(public_val / "train_data" / "log.json", "w") as f:
505
+ json.dump([log for log in logs if log["token"] in train_val_log_ids], f)
506
+ with open(public_val / "test_data" / "log.json", "w") as f:
507
+ json.dump([log for log in logs if log["token"] in test_val_log_ids], f)
508
+
509
+ # Split train_samples into train_val and test_val
510
+ train_val_samples = [sample for sample in train_samples if sample["scene_token"] in train_val_log_ids]
511
+ test_val_samples = [sample for sample in train_samples if sample["scene_token"] in test_val_log_ids]
512
+ logger.info(f"Validation samples - Train: {len(train_val_samples)}, Test: {len(test_val_samples)}")
513
+
514
+ with open(public_val / "train_data" / "sample.json", "w") as f:
515
+ json.dump(train_val_samples, f)
516
+ with open(public_val / "test_data" / "sample.json", "w") as f:
517
+ json.dump(test_val_samples, f)
518
+
519
+ # Create validation train.csv from new_train_df
520
+ train_val_df = new_train_df[new_train_df["Id"].isin([sample["token"] for sample in train_val_samples])]
521
+ test_val_df = new_train_df[new_train_df["Id"].isin([sample["token"] for sample in test_val_samples])]
522
+ logger.info(f"Validation CSV - Train rows: {len(train_val_df)}, Test rows: {len(test_val_df)}")
523
+ train_val_df.to_csv(public_val / "train.csv", index=False)
524
+
525
+ # Create validation test.csv with confidence added
526
+ test_val_df_copy = test_val_df.copy()
527
+ test_val_df_copy["PredictionString"] = test_val_df_copy["PredictionString"].apply(add_confidence)
528
+ test_val_df_copy.to_csv(private_val / "test.csv", index=False)
529
+
530
+ # Create validation sample_submission.csv
531
+ sample_submission_val = test_val_df[["Id"]].copy()
532
+ sample_submission_val["PredictionString"] = ""
533
+ sample_submission_val.to_csv(public_val / "sample_submission.csv", index=False)
534
+
535
+ # Split sample_data for validation
536
+ train_val_sample_data, test_val_sample_data = [], []
537
+ for sample_datum in new_train_sample_data:
538
+ sample_token = sample_datum["sample_token"]
539
+ if sample_token in train_val_df["Id"].values:
540
+ train_val_sample_data.append(sample_datum)
541
+ elif sample_token in test_val_df["Id"].values:
542
+ test_val_sample_data.append(sample_datum)
543
+
544
+ logger.info(f"Validation sample data - Train: {len(train_val_sample_data)}, Test: {len(test_val_sample_data)}")
545
+
546
+ with open(public_val / "train_data" / "sample_data.json", "w") as f:
547
+ json.dump(train_val_sample_data, f)
548
+ with open(public_val / "test_data" / "sample_data.json", "w") as f:
549
+ json.dump(test_val_sample_data, f)
550
+
551
+ # Copy maps for validation
552
+ (public_val / "test_maps").mkdir(parents=True, exist_ok=True)
553
+ (public_val / "train_maps").mkdir(parents=True, exist_ok=True)
554
+ shutil.copyfile(
555
+ src=raw / "train_maps" / "map_raster_palo_alto.png",
556
+ dst=public_val / "test_maps" / "map_raster_palo_alto.png",
557
+ )
558
+ shutil.copyfile(
559
+ src=raw / "train_maps" / "map_raster_palo_alto.png",
560
+ dst=public_val / "train_maps" / "map_raster_palo_alto.png",
561
+ )
562
+
563
+ # Copy attribute.json for validation
564
+ shutil.copyfile(
565
+ src=raw / "train_data" / "attribute.json", dst=public_val / "train_data" / "attribute.json"
566
+ )
567
+ shutil.copyfile(
568
+ src=raw / "train_data" / "attribute.json", dst=public_val / "test_data" / "attribute.json"
569
+ )
570
+
571
+ # Split calibrated_sensor.json for validation
572
+ train_val_calibrated_sensors, test_val_calibrated_sensors = [], []
573
+ for sample_datum in train_val_sample_data:
574
+ calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
575
+ if calibrated_sensor_token in [cal["token"] for cal in train_val_calibrated_sensors]:
576
+ continue
577
+ train_val_calibrated_sensors.append(
578
+ calibration_by_calibrated_sensor_token[calibrated_sensor_token]
579
+ )
580
+ for sample_datum in test_val_sample_data:
581
+ calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
582
+ if calibrated_sensor_token in [cal["token"] for cal in test_val_calibrated_sensors]:
583
+ continue
584
+ test_val_calibrated_sensors.append(
585
+ calibration_by_calibrated_sensor_token[calibrated_sensor_token]
586
+ )
587
+
588
+ with open(public_val / "train_data" / "calibrated_sensor.json", "w") as f:
589
+ json.dump(train_val_calibrated_sensors, f)
590
+ with open(public_val / "test_data" / "calibrated_sensor.json", "w") as f:
591
+ json.dump(test_val_calibrated_sensors, f)
592
+
593
+ # Copy category.json for validation
594
+ shutil.copyfile(
595
+ src=raw / "train_data" / "category.json", dst=public_val / "train_data" / "category.json"
596
+ )
597
+ shutil.copyfile(
598
+ src=raw / "train_data" / "category.json", dst=public_val / "test_data" / "category.json"
599
+ )
600
+
601
+ # Split ego_pose.json for validation
602
+ train_val_ego_poses, test_val_ego_poses = [], []
603
+ for sample_datum in train_val_sample_data:
604
+ ego_pose_token = sample_datum["ego_pose_token"]
605
+ train_val_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
606
+ for sample_datum in test_val_sample_data:
607
+ ego_pose_token = sample_datum["ego_pose_token"]
608
+ test_val_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
609
+
610
+ with open(public_val / "train_data" / "ego_pose.json", "w") as f:
611
+ json.dump(train_val_ego_poses, f)
612
+ with open(public_val / "test_data" / "ego_pose.json", "w") as f:
613
+ json.dump(test_val_ego_poses, f)
614
+
615
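+ # Create map.json for validation. NOTE(review): maps.copy() is a shallow list copy, so train_val_maps[0] and test_val_maps[0] are the same dict and the test assignment below overwrites the train one before either file is written -- confirm, and use copy.deepcopy if unintended.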
616
+ train_val_maps = maps.copy()
617
+ train_val_maps[0]["log_tokens"] = train_val_log_ids
618
+ test_val_maps = maps.copy()
619
+ test_val_maps[0]["log_tokens"] = test_val_log_ids
620
+ with open(public_val / "train_data" / "map.json", "w") as f:
621
+ json.dump(train_val_maps, f)
622
+ with open(public_val / "test_data" / "map.json", "w") as f:
623
+ json.dump(test_val_maps, f)
624
+
625
+ # Split scene.json for validation
626
+ train_val_scenes, test_val_scenes = [], []
627
+ for scene in new_train_scenes:
628
+ log_token = scene["log_token"]
629
+ if log_token in train_val_log_ids:
630
+ train_val_scenes.append(scene)
631
+ elif log_token in test_val_log_ids:
632
+ test_val_scenes.append(scene)
633
+
634
+ with open(public_val / "train_data" / "scene.json", "w") as f:
635
+ json.dump(train_val_scenes, f)
636
+ with open(public_val / "test_data" / "scene.json", "w") as f:
637
+ json.dump(test_val_scenes, f)
638
+
639
+ # Copy sensor.json for validation
640
+ shutil.copyfile(
641
+ src=raw / "train_data" / "sensor.json", dst=public_val / "train_data" / "sensor.json"
642
+ )
643
+ shutil.copyfile(
644
+ src=raw / "train_data" / "sensor.json", dst=public_val / "test_data" / "sensor.json"
645
+ )
646
+
647
+ # Copy visibility.json for validation
648
+ shutil.copyfile(
649
+ src=raw / "train_data" / "visibility.json", dst=public_val / "train_data" / "visibility.json"
650
+ )
651
+ shutil.copyfile(
652
+ src=raw / "train_data" / "visibility.json", dst=public_val / "test_data" / "visibility.json"
653
+ )
654
+
655
+ # Split sample_annotation.json for validation
656
+ train_val_sample_annotations, test_val_sample_annotations = [], []
657
+ for sample_annotation in new_train_sample_annotations:
658
+ sample_token = sample_annotation["sample_token"]
659
+ if sample_token in train_val_df["Id"].values:
660
+ train_val_sample_annotations.append(sample_annotation)
661
+ elif sample_token in test_val_df["Id"].values:
662
+ test_val_sample_annotations.append(sample_annotation)
663
+
664
+ with open(public_val / "train_data" / "sample_annotation.json", "w") as f:
665
+ json.dump(train_val_sample_annotations, f)
666
+
667
+ # Split instance.json for validation
668
+ train_val_instance_ids = set([sa["instance_token"] for sa in train_val_sample_annotations])
669
+ test_val_instance_ids = set([sa["instance_token"] for sa in test_val_sample_annotations])
670
+ train_val_instances, test_val_instances = [], []
671
+ for instance in new_train_instances:
672
+ if instance["token"] in train_val_instance_ids:
673
+ train_val_instances.append(instance)
674
+ elif instance["token"] in test_val_instance_ids:
675
+ test_val_instances.append(instance)
676
+
677
+ with open(public_val / "train_data" / "instance.json", "w") as f:
678
+ json.dump(train_val_instances, f)
679
+
680
+ # Copy images and lidar data for validation
681
+ (public_val / "test_images").mkdir(parents=True, exist_ok=True)
682
+ (public_val / "train_images").mkdir(parents=True, exist_ok=True)
683
+ (public_val / "test_lidar").mkdir(parents=True, exist_ok=True)
684
+ (public_val / "train_lidar").mkdir(parents=True, exist_ok=True)
685
+
686
+ num_train_val_images, num_test_val_images = 0, 0
687
+ num_train_val_lidar, num_test_val_lidar = 0, 0
688
+
689
+ for sample_datum in tqdm(new_train_sample_data, desc="Copying validation images and lidar data"):
690
+ filename = Path(sample_datum["filename"]).name
691
+ is_test_val = sample_datum["sample_token"] in test_val_df["Id"].values
692
+
693
+ if sample_datum["fileformat"] == "jpeg":
694
+ src_file = public / "train_images" / filename
695
+ if is_test_val:
696
+ dst_file = public_val / "test_images" / filename
697
+ if not dst_file.exists():
698
+ shutil.copyfile(src=src_file, dst=dst_file)
699
+ num_test_val_images += 1
700
+ else:
701
+ dst_file = public_val / "train_images" / filename
702
+ if not dst_file.exists():
703
+ shutil.copyfile(src=src_file, dst=dst_file)
704
+ num_train_val_images += 1
705
+ elif sample_datum["fileformat"] == "bin":
706
+ src_file = public / "train_lidar" / filename
707
+ if is_test_val:
708
+ dst_file = public_val / "test_lidar" / filename
709
+ if not dst_file.exists():
710
+ shutil.copyfile(src=src_file, dst=dst_file)
711
+ num_test_val_lidar += 1
712
+ else:
713
+ dst_file = public_val / "train_lidar" / filename
714
+ if not dst_file.exists():
715
+ shutil.copyfile(src=src_file, dst=dst_file)
716
+ num_train_val_lidar += 1
717
+
718
+ logger.info(f"Validation images - Train: {num_train_val_images}, Test: {num_test_val_images}")
719
+ logger.info(f"Validation lidar - Train: {num_train_val_lidar}, Test: {num_test_val_lidar}")