julearn 0.3.4.dev43__tar.gz → 0.3.5.dev16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/PKG-INFO +1 -1
  2. julearn-0.3.5.dev16/docs/api/config.rst +21 -0
  3. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/index.rst +1 -0
  4. julearn-0.3.5.dev16/docs/changes/newsfragments/277.enh +1 -0
  5. julearn-0.3.5.dev16/docs/changes/newsfragments/278.doc +1 -0
  6. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/conf.py +0 -37
  7. julearn-0.3.5.dev16/docs/images/joblib_htcondor/condor_q.png +0 -0
  8. julearn-0.3.5.dev16/docs/images/joblib_htcondor/ui_main.png +0 -0
  9. julearn-0.3.5.dev16/docs/images/joblib_htcondor/ui_open.png +0 -0
  10. julearn-0.3.5.dev16/docs/images/joblib_htcondor/ui_stacked.png +0 -0
  11. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/links.inc +2 -0
  12. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/index.rst +1 -0
  13. julearn-0.3.5.dev16/docs/selected_deeper_topics/joblib.rst +564 -0
  14. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/_version.py +2 -2
  15. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/filter_columns.py +7 -1
  16. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn.egg-info/PKG-INFO +1 -1
  17. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn.egg-info/SOURCES.txt +8 -0
  18. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  19. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  20. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/ISSUE_TEMPLATE/documentation_request.yaml +0 -0
  21. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  22. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/check-stale.yml +0 -0
  23. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/ci-docs.yml +0 -0
  24. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/ci.yml +0 -0
  25. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/docs-preview.yml +0 -0
  26. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/docs.yml +0 -0
  27. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/lint.yml +0 -0
  28. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.github/workflows/pypi.yml +0 -0
  29. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.gitignore +0 -0
  30. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/.pre-commit-config.yaml +0 -0
  31. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/AUTHORS.rst +0 -0
  32. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/LICENSE.md +0 -0
  33. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/README.md +0 -0
  34. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/codecov.yml +0 -0
  35. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/Makefile +0 -0
  36. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/_static/css/custom.css +0 -0
  37. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/_static/js/custom.js +0 -0
  38. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/_templates/class.rst +0 -0
  39. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/_templates/function.rst +0 -0
  40. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/_templates/function_warning.rst +0 -0
  41. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/_templates/versions.html +0 -0
  42. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/base.rst +0 -0
  43. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/inspect.rst +0 -0
  44. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/main.rst +0 -0
  45. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/model_selection.rst +0 -0
  46. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/models.rst +0 -0
  47. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/pipeline.rst +0 -0
  48. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/prepare.rst +0 -0
  49. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/scoring.rst +0 -0
  50. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/stats.rst +0 -0
  51. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/transformers.rst +0 -0
  52. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/utils.rst +0 -0
  53. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/api/viz.rst +0 -0
  54. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/available_pipeline_steps.rst +0 -0
  55. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/changes/contributors.inc +0 -0
  56. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/changes/newsfragments/.gitignore +0 -0
  57. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/configuration.rst +0 -0
  58. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/contributing.rst +0 -0
  59. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/examples.rst +0 -0
  60. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/faq.rst +0 -0
  61. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/getting_started.rst +0 -0
  62. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/corrected_ttest.png +0 -0
  63. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/final_estimator.png +0 -0
  64. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/iris_X.png +0 -0
  65. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/iris_df.png +0 -0
  66. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/iris_y.png +0 -0
  67. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo.png +0 -0
  68. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_calm.png +0 -0
  69. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_confbias.png +0 -0
  70. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_cv.png +0 -0
  71. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_generalization.png +0 -0
  72. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_it.png +0 -0
  73. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_ml.png +0 -0
  74. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/julearn_logo_mlit.png +0 -0
  75. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/multiple_scorers_run_cv.png +0 -0
  76. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/plot_scores.png +0 -0
  77. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/scores_run_cv.png +0 -0
  78. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/scores_run_cv_splitter.png +0 -0
  79. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/images/scores_run_cv_train.png +0 -0
  80. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/index.rst +0 -0
  81. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/maintaining.rst +0 -0
  82. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/redirect.html +0 -0
  83. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/CBPM.rst +0 -0
  84. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/confound_removal.rst +0 -0
  85. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/cross_validation_splitter.rst +0 -0
  86. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/hyperparameter_tuning.rst +0 -0
  87. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/model_inspect.rst +0 -0
  88. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/stacked_models.rst +0 -0
  89. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/selected_deeper_topics/target_transformers.rst +0 -0
  90. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/sphinxext/gh_substitutions.py +0 -0
  91. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/what_really_need_know/cross_validation.rst +0 -0
  92. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/what_really_need_know/data.rst +0 -0
  93. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/what_really_need_know/index.rst +0 -0
  94. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/what_really_need_know/model_comparison.rst +0 -0
  95. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/what_really_need_know/model_evaluation.rst +0 -0
  96. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/what_really_need_know/pipeline.rst +0 -0
  97. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/docs/whats_new.rst +0 -0
  98. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/README.rst +0 -0
  99. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/plot_cm_acc_multiclass.py +0 -0
  100. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/plot_example_regression.py +0 -0
  101. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/plot_stratified_kfold_reg.py +0 -0
  102. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/run_combine_pandas.py +0 -0
  103. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/run_grouped_cv.py +0 -0
  104. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/00_starting/run_simple_binary_classification.py +0 -0
  105. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/01_model_comparison/README.rst +0 -0
  106. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/01_model_comparison/plot_simple_model_comparison.py +0 -0
  107. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/02_inspection/README.rst +0 -0
  108. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/02_inspection/plot_groupcv_inspect_svm.py +0 -0
  109. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/02_inspection/plot_inspect_random_forest.py +0 -0
  110. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/02_inspection/plot_preprocess.py +0 -0
  111. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/02_inspection/run_binary_inspect_folds.py +0 -0
  112. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/README.rst +0 -0
  113. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/run_apply_to_target.py +0 -0
  114. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/run_example_pca_featsets.py +0 -0
  115. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/run_hyperparameter_multiple_grids.py +0 -0
  116. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/run_hyperparameter_tuning.py +0 -0
  117. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/run_hyperparameter_tuning_bayessearch.py +0 -0
  118. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/03_complex_models/run_stacked_models.py +0 -0
  119. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/04_confounds/README.rst +0 -0
  120. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/04_confounds/plot_confound_removal_classification.py +0 -0
  121. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/04_confounds/run_return_confounds.py +0 -0
  122. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/05_customization/README.rst +0 -0
  123. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/05_customization/run_custom_scorers_regression.py +0 -0
  124. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/README.rst +0 -0
  125. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_cbpm_docs.py +0 -0
  126. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_confound_removal_docs.py +0 -0
  127. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_cv_splitters_docs.py +0 -0
  128. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_data_docs.py +0 -0
  129. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_hyperparameters_docs.py +0 -0
  130. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_model_comparison_docs.py +0 -0
  131. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_model_evaluation_docs.py +0 -0
  132. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_model_inspection_docs.py +0 -0
  133. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_pipeline_docs.py +0 -0
  134. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_stacked_models_docs.py +0 -0
  135. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/99_docs/run_target_transformer_docs.py +0 -0
  136. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/README.rst +0 -0
  137. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/XX_disabled/dis_run_n_jobs.py +0 -0
  138. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/examples/XX_disabled/dis_run_target_confound_removal.py +0 -0
  139. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/ignore_words.txt +0 -0
  140. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/__init__.py +0 -0
  141. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/api.py +0 -0
  142. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/base/__init__.py +0 -0
  143. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/base/column_types.py +0 -0
  144. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/base/estimators.py +0 -0
  145. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/base/tests/test_base_estimators.py +0 -0
  146. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/base/tests/test_column_types.py +0 -0
  147. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/config.py +0 -0
  148. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/conftest.py +0 -0
  149. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/external/optuna_searchcv.py +0 -0
  150. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/__init__.py +0 -0
  151. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/_cv.py +0 -0
  152. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/_pipeline.py +0 -0
  153. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/_preprocess.py +0 -0
  154. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/inspector.py +0 -0
  155. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/tests/test_cv.py +0 -0
  156. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/tests/test_inspector.py +0 -0
  157. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/tests/test_pipeline.py +0 -0
  158. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/inspect/tests/test_preprocess.py +0 -0
  159. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/__init__.py +0 -0
  160. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/_optuna_searcher.py +0 -0
  161. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/_skopt_searcher.py +0 -0
  162. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/available_searchers.py +0 -0
  163. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/continuous_stratified_kfold.py +0 -0
  164. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/final_model_cv.py +0 -0
  165. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/stratified_bootstrap.py +0 -0
  166. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/tests/test_available_searchers.py +0 -0
  167. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/tests/test_continous_stratified_kfold.py +0 -0
  168. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/tests/test_final_model_cv.py +0 -0
  169. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/tests/test_optuna_searcher.py +0 -0
  170. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/tests/test_skopt_searcher.py +0 -0
  171. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/tests/test_stratified_bootstrap.py +0 -0
  172. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/model_selection/utils.py +0 -0
  173. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/models/__init__.py +0 -0
  174. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/models/available_models.py +0 -0
  175. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/models/dynamic.py +0 -0
  176. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/models/tests/test_available_models.py +0 -0
  177. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/models/tests/test_dynamic.py +0 -0
  178. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/models/tests/test_models.py +0 -0
  179. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/__init__.py +0 -0
  180. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/merger.py +0 -0
  181. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/pipeline_creator.py +0 -0
  182. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/target_pipeline.py +0 -0
  183. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/target_pipeline_creator.py +0 -0
  184. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/tests/test_merger.py +0 -0
  185. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/tests/test_pipeline_creator.py +0 -0
  186. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/tests/test_target_pipeline.py +0 -0
  187. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/pipeline/tests/test_target_pipeline_creator.py +0 -0
  188. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/prepare.py +0 -0
  189. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/scoring/__init__.py +0 -0
  190. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/scoring/available_scorers.py +0 -0
  191. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/scoring/metrics.py +0 -0
  192. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/scoring/tests/test_available_scorers.py +0 -0
  193. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/scoring/tests/test_metrics.py +0 -0
  194. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/stats/__init__.py +0 -0
  195. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/stats/corrected_ttest.py +0 -0
  196. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/stats/tests/test_corrected_ttest.py +0 -0
  197. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/tests/test_api.py +0 -0
  198. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/tests/test_config.py +0 -0
  199. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/tests/test_prepare.py +0 -0
  200. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/__init__.py +0 -0
  201. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/available_transformers.py +0 -0
  202. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/cbpm.py +0 -0
  203. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/confound_remover.py +0 -0
  204. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/__init__.py +0 -0
  205. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/change_column_types.py +0 -0
  206. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/drop_columns.py +0 -0
  207. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/set_column_types.py +0 -0
  208. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/tests/test_change_column_types.py +0 -0
  209. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/tests/test_drop_columns.py +0 -0
  210. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/tests/test_filter_columns.py +0 -0
  211. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/dataframe/tests/test_set_column_types.py +0 -0
  212. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/ju_column_transformer.py +0 -0
  213. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/__init__.py +0 -0
  214. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/available_target_transformers.py +0 -0
  215. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/ju_target_transformer.py +0 -0
  216. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/ju_transformed_target_model.py +0 -0
  217. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/target_confound_remover.py +0 -0
  218. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/tests/test_available_target_transformers.py +0 -0
  219. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/tests/test_ju_target_transformer.py +0 -0
  220. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/tests/test_ju_transformed_target_model.py +0 -0
  221. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/target/tests/test_target_confound_remover.py +0 -0
  222. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/tests/test_available_transformers.py +0 -0
  223. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/tests/test_cbpm.py +0 -0
  224. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/tests/test_confounds.py +0 -0
  225. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/transformers/tests/test_jucolumntransformers.py +0 -0
  226. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/__init__.py +0 -0
  227. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/_cv.py +0 -0
  228. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/checks.py +0 -0
  229. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/logging.py +0 -0
  230. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/testing.py +0 -0
  231. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/tests/test_logging.py +0 -0
  232. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/tests/test_version.py +0 -0
  233. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/typing.py +0 -0
  234. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/utils/versions.py +0 -0
  235. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/viz/__init__.py +0 -0
  236. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/viz/_scores.py +0 -0
  237. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn/viz/res/julearn_logo_generalization.png +0 -0
  238. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn.egg-info/dependency_links.txt +0 -0
  239. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn.egg-info/requires.txt +0 -0
  240. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/julearn.egg-info/top_level.txt +0 -0
  241. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/pyproject.toml +0 -0
  242. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/setup.cfg +0 -0
  243. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/setup.py +0 -0
  244. {julearn-0.3.4.dev43 → julearn-0.3.5.dev16}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: julearn
3
- Version: 0.3.4.dev43
3
+ Version: 0.3.5.dev16
4
4
  Summary: Juelich Machine Learning Library
5
5
  Author-email: Fede Raimondo <f.raimondo@fz-juelich.de>, Sami Hamdan <s.hamdan@fz-juelich.de>
6
6
  Maintainer-email: Sami Hamdan <s.hamdan@fz-juelich.de>
@@ -0,0 +1,21 @@
1
+ Config
2
+ ======
3
+
4
+ .. automodule:: julearn.config
5
+ :no-members:
6
+ :no-inherited-members:
7
+
8
+
9
+ See :ref:`configuration` for more information on the flags that can be set.
10
+
11
+ Functions
12
+ ---------
13
+
14
+ .. currentmodule:: julearn.config
15
+
16
+ .. autosummary::
17
+ :toctree: generated/
18
+ :template: function.rst
19
+
20
+ set_config
21
+ get_config
@@ -18,3 +18,4 @@ API Reference
18
18
  prepare.rst
19
19
  stats.rst
20
20
  viz.rst
21
+ config.rst
@@ -0,0 +1 @@
1
+ Avoid parallel calls in :class:`.FilterColumns` if not specified by the user in the :mod:`.config` module by `Fede Raimondo`_
@@ -0,0 +1 @@
1
+ Include documentation on how to use the `joblib`_ and `joblib-htcondor`_ libraries to parallelize computation by `Fede Raimondo`_
@@ -211,42 +211,6 @@ numpydoc_xref_ignore = {
211
211
 
212
212
  # -- Sphinx-Gallery configuration --------------------------------------------
213
213
 
214
-
215
- class SubSectionTitleOrder:
216
- """Sort example gallery by title of subsection.
217
-
218
- Assumes README.txt exists for all subsections and uses the subsection with
219
- dashes, '---', as the adornment.
220
- """
221
-
222
- def __init__(self, src_dir):
223
- self.src_dir = src_dir
224
- self.regex = re.compile(r"^([\w ]+)\n-", re.MULTILINE)
225
-
226
- def __repr__(self):
227
- return f"<{self.__class__.__name__}>"
228
-
229
- def __call__(self, directory):
230
- src_path = os.path.normpath(os.path.join(self.src_dir, directory))
231
-
232
- # Forces Release Highlights to the top
233
- if os.path.basename(src_path) == "release_highlights":
234
- return "0"
235
-
236
- readme = os.path.join(src_path, "README.txt")
237
-
238
- try:
239
- with open(readme) as f:
240
- content = f.read()
241
- except FileNotFoundError:
242
- return directory
243
-
244
- title_match = self.regex.search(content)
245
- if title_match is not None:
246
- return title_match.group(1)
247
- return directory
248
-
249
-
250
214
  ex_dirs = [
251
215
  "00_starting",
252
216
  "01_model_comparison",
@@ -269,7 +233,6 @@ sphinx_gallery_conf = {
269
233
  "examples_dirs": example_dirs,
270
234
  "gallery_dirs": gallery_dirs,
271
235
  "nested_sections": True,
272
- "subsection_order": SubSectionTitleOrder("../examples"),
273
236
  "filename_pattern": "/(plot|run)_",
274
237
  "download_all_examples": False,
275
238
  }
@@ -43,3 +43,5 @@
43
43
  .. _`scikit-optimize`: https://scikit-optimize.readthedocs.io/en/stable/
44
44
  .. _`Optuna`: https://optuna.org
45
45
  .. _`optuna_integration`: https://github.com/optuna/optuna-integration
46
+ .. _`Joblib`: https://joblib.readthedocs.io/en/stable/
47
+ .. _`joblib-htcondor`: https://github.com/juaml/joblib-htcondor
@@ -16,3 +16,4 @@ Selected deeper topics
16
16
  cross_validation_splitter.rst
17
17
  stacked_models.rst
18
18
  CBPM.rst
19
+ joblib.rst
@@ -0,0 +1,564 @@
1
+ .. include:: ../links.inc
2
+
3
+
4
+ .. _joblib_parallel:
5
+
6
+ Parallelizing julearn with Joblib
7
+ =================================
8
+
9
+ .. warning::
10
+ Make sure you are using the latest version of ``julearn``, as we are
11
+ actively developing and fine-tuning these packages to improve performance.
12
+ Older versions of ``julearn`` might incur a large computational overhead when used
13
+ with joblib.
14
+
15
+ As with `scikit-learn`_, ``julearn`` allows you to parallelize your code using
16
+ `Joblib`_. This can be particularly useful when you have a large dataset or
17
+ when you are running a computationally expensive operation that can be easily
18
+ computed in parallel.
19
+
20
+ Without going into details about parallel and distributed computing, the idea
21
+ is to split the computation into smaller tasks that can be executed
22
+ independently from each other. This way, you can take advantage of multiple
23
+ *processors* to do them in parallel. A very clear example of this situation is
24
+ when you are estimating a model's performance using cross-validation. In this
25
+ case, you can parallelize the computation of the different folds, as the
26
+ training and testing of each fold are independent from the other folds.
27
+
28
+ Almost all modern computers have multiple processors or *cores*, which allows
29
+ them to run multiple tasks at the same time. If you are familiar, you might
30
+ have noticed that scikit-learn already has a parallelization mechanism using
31
+ the ``n_jobs`` parameter. ``julearn`` is actually using scikit-learn, so it is
32
+ possible to use the ``n_jobs`` parameter. If you want to read more about how
33
+ scikit-learn parallelizes its code, you can check the section
34
+ :external+sklearn:ref:`parallelism` on scikit-learn's documentation.
35
+
36
+ In short, the ``n_jobs`` parameter in scikit-learn allows you to specify the
37
+ number of jobs to run in parallel. If you set it to ``-1``, it will use all the
38
+ processors available, which is usually the best option for most cases.
39
+
40
+ One of the explicit ways to control parallelization in scikit-learn and
41
+ ``julearn`` is to use a joblib's
42
+ :external+joblib:class:`~joblib.parallel_config` context manager. The following
43
+ snippet will run the code in parallel using the ``"loky"`` backend
44
+ with 4 processors.
45
+
46
+ .. code-block:: python
47
+
48
+ from joblib import parallel_config
49
+
50
+ creator = PipelineCreator(problem_type="classification")
51
+ creator.add("zscore")
52
+ creator.add("svm")
53
+
54
+ with parallel_config(backend="loky", n_jobs=4):
55
+ scores = run_cross_validation(
56
+ X=X,
57
+ y=y,
58
+ X_types=X_types,
59
+ data=df_data,
60
+ model=creator,
61
+ problem_type="classification",
62
+ )
63
+
64
+
65
+ Importantly, this sets the number of parallel processes that joblib dispatches
66
+ but does not set the number of threads (i.e. another low-level parallelization
67
+ mechanism) that each process uses. The ``"loky"`` backend is quite intelligent,
68
+ but can't always determine the optimal number of threads to use if you don't
69
+ want to use 100% of the resources. You can set the number of threads per
70
+ process by setting the ``inner_max_num_threads`` parameter in the context
71
+ manager. For example, to make sure that only 4 processors are used and each
72
+ process uses only 1 thread, you can do the following:
73
+
74
+ .. code-block:: python
75
+
76
+ from joblib import parallel_config
77
+
78
+ creator = PipelineCreator(problem_type="classification")
79
+ creator.add("zscore")
80
+ creator.add("svm")
81
+
82
+ with parallel_config(backend="loky", n_jobs=4, inner_max_num_threads=1):
83
+ scores = run_cross_validation(
84
+ X=X,
85
+ y=y,
86
+ X_types=X_types,
87
+ data=df_data,
88
+ model=creator,
89
+ problem_type="classification",
90
+ )
91
+
92
+ Massively parallelizing ``julearn`` with Joblib and HTCondor
93
+ ------------------------------------------------------------
94
+
95
+ Sometimes even with multiple processors, the computation can take a long time.
96
+ As an example, assuming a model that takes 1 hour to fit, a 5 times 5-fold
97
+ cross-validation takes 25 hours of computation. If you add a grid search to
98
+ find the best hyperparameter using another 5-fold CV, and this grid has
99
+ 10 hyperparameter sets to test, this adds another 1250 hours of computation.
100
+ This is a total of 1275 hours. In technical terms, these are 1275 core-hours,
101
+ which is a unit of processing time in a single core. With 4 processors, this is
102
+ 318 hours, which is almost 13 days of computation.
103
+ If the model takes 10 hours to fit, this goes to 12750 core-hours, which is
104
+ almost 4.5 months with 4 processors.
105
+
106
+ As you can see in the following table, sometimes we might need to use
107
+ hundreds of processors to obtain results within reasonable time spans.
108
+
109
+ .. csv-table::
110
+ :align: center
111
+ :header: "Total core-hours", "Number of processors", "Time (approx.)"
112
+
113
+ "1250", "1", "52 days"
114
+ "1250", "4", "13 days"
115
+ "1250", "16", "3.25 days"
116
+ "12750", "1", "1.4 years"
117
+ "12750", "4", "4.4 months"
118
+ "12750", "16", "1.1 months"
119
+
120
+
121
+ At the `INM-7`_, where we have mainly developed this library, we have a
122
+ computational cluster that uses HTCondor. To overcome this limitation of
123
+ ``julearn`` and `joblib`_, we have created the `joblib-htcondor`_ backend. This
124
+ allows joblib to submit each task as a job in an HTCondor queue, allowing to
125
+ massively parallelize computation.
126
+
127
+ By simply calling ``register_htcondor`` from the ``joblib_htcondor`` package
128
+ and configuring the backend, we can easily parallelize the computations as in
129
+ the following example:
130
+
131
+
132
+ .. code-block:: python
133
+
134
+ from joblib import parallel_config
135
+ from joblib_htcondor import register_htcondor
136
+
137
+ register_htcondor("INFO") # Set logging level to INFO
138
+
139
+ creator = PipelineCreator(problem_type="classification")
140
+ creator.add("zscore")
141
+ creator.add("svm")
142
+
143
+ with parallel_config(
144
+ backend="htcondor",
145
+ n_jobs=-1,
146
+ request_cpus=1,
147
+ request_mem="4Gb",
148
+ ):
149
+ scores = run_cross_validation(
150
+ X=X,
151
+ y=y,
152
+ X_types=X_types,
153
+ data=df_data,
154
+ model=creator,
155
+ problem_type="classification",
156
+ cv=cv,
157
+ )
158
+
159
+
160
+ This will submit each task to the HTCondor queue, and the computation will be
161
+ done in parallel. The ``request_cpus`` parameter specifies the number of CPUs
162
+ to request for each job, and the ``request_mem`` parameter specifies the
163
+ amount of memory to request.
164
+
165
+
166
+ .. note::
167
+ Note that the ``register_htcondor`` function sets the logging level to
168
+ ``"INFO"``, which means that you will see information regarding the HTCondor
169
+ backend. If you want to see less information, you can set the logging level to
170
+ ``"WARNING"``. If you believe that there might be an issue with the backend,
171
+ you can set the logging level to ``"DEBUG"`` to see more information that
172
+ can be later shared with the developers.
173
+
174
+ Nevertheless, as it is, it will submit as many jobs as outer folds in the
175
+ cross-validation, and it will rarely work for large projects as we need to take
176
+ into account other factors.
177
+
178
+ Data Transfer
179
+ ~~~~~~~~~~~~~
180
+
181
+ When submitting jobs to a cluster, we need to take into account that the data
182
+ needs to be transferred between workers. The `joblib-htcondor`_ backend uses
183
+ filesystem-based data transfer, which means that the data will be stored in
184
+ a file that must be accessible by all the workers. The location of the shared
185
+ directory can be specified with the ``shared_data_dir`` parameter.
186
+
187
+ For example, at the INM-7, we can have a shared directory at
188
+ ``/data/project/supercool_research``. Our example then becomes:
189
+
190
+
191
+ .. code-block:: python
192
+
193
+ from joblib import parallel_config
194
+ from joblib_htcondor import register_htcondor
195
+
196
+ register_htcondor("INFO")
197
+
198
+ creator = PipelineCreator(problem_type="classification")
199
+ creator.add("zscore")
200
+ creator.add("svm")
201
+
202
+ with parallel_config(
203
+ backend="htcondor",
204
+ n_jobs=-1,
205
+ request_cpus=1,
206
+ request_mem="4Gb",
207
+ shared_data_dir="/data/project/supercool_research",
208
+ ):
209
+ scores = run_cross_validation(
210
+ X=X,
211
+ y=y,
212
+ X_types=X_types,
213
+ data=df_data,
214
+ model=creator,
215
+ problem_type="classification",
216
+ cv=cv,
217
+ )
218
+
219
+
220
+ Pool
221
+ ~~~~
222
+
223
+ As in any computational cluster, most probably you will be required to submit
224
+ a job to a queue, which will then run the :func:`.run_cross_validation`
225
+ function that will then submit more jobs to the queue. This is not a problem,
226
+ but it needs to be possible to submit jobs from within a job. Check with your
227
+ cluster's admin team and ask for further instructions. Most probably you'll
228
+ also need to specify to which `pool` the jobs will be submitted. This can be
229
+ done with the ``pool`` parameter. For us, this is ``head2.htc.inm7.de``:
230
+
231
+
232
+ .. code-block:: python
233
+
234
+ from joblib import parallel_config
235
+ from joblib_htcondor import register_htcondor
236
+
237
+ register_htcondor("INFO")
238
+
239
+ creator = PipelineCreator(problem_type="classification")
240
+ creator.add("zscore")
241
+ creator.add("svm")
242
+
243
+ with parallel_config(
244
+ backend="htcondor",
245
+ n_jobs=-1,
246
+ request_cpus=1,
247
+ request_mem="4Gb",
248
+ shared_data_dir="/data/project/supercool_research",
249
+ pool="head2.htc.inm7.de",
250
+ ):
251
+ scores = run_cross_validation(
252
+ X=X,
253
+ y=y,
254
+ X_types=X_types,
255
+ data=df_data,
256
+ model=creator,
257
+ problem_type="classification",
258
+ cv=cv,
259
+ )
260
+
261
+ Recursive parallelization
262
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
263
+
264
+ Let's say we have the following pipeline:
265
+
266
+ .. code-block:: python
267
+
268
+ creator = PipelineCreator(problem_type="classification")
269
+ creator.add("zscore")
270
+ creator.add(
271
+ "svm",
272
+ kernel="rbf",
273
+ C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
274
+ gamma=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
275
+ )
276
+
277
+ This is indeed a pipeline with a hyperparameter search. A 5-fold Grid Search
278
+ approach will evaluate 49 different models, 5 times each. And this will happen
279
+ for every outer fold. So if we use `joblib-htcondor`_ as in the previous
280
+ example, each task could also benefit from parallelism, by submitting each
281
+ inner fold of each hyperparameter combination as a separate job. This is
282
+ called recursive parallelization and we can instruct the backend to allow this
283
+ by setting the ``max_recursion_level`` parameter to ``1``:
284
+
285
+ .. code-block:: python
286
+
287
+ from joblib import parallel_config
288
+ from joblib_htcondor import register_htcondor
289
+
290
+ register_htcondor("INFO")
291
+
292
+ creator = PipelineCreator(problem_type="classification")
293
+ creator.add("zscore")
294
+ creator.add("svm")
295
+
296
+ with parallel_config(
297
+ backend="htcondor",
298
+ n_jobs=-1,
299
+ request_cpus=1,
300
+ request_mem="4Gb",
301
+ shared_data_dir="/data/project/supercool_research",
302
+ pool="head2.htc.inm7.de",
303
+ max_recursion_level=1,
304
+ ):
305
+ scores = run_cross_validation(
306
+ X=X,
307
+ y=y,
308
+ X_types=X_types,
309
+ data=df_data,
310
+ model=creator,
311
+ problem_type="classification",
312
+ cv=cv,
313
+ )
314
+
315
+ .. warning::
316
+ `scikit-learn`_ parallelizes many algorithms internally by default. So if
317
+ you set the ``max_recursion_level`` to something different than 0, you
318
+ might end up with thousands upon thousands of jobs. Please READ THE WHOLE
319
+ DOCUMENTATION before using this parameter, as it will be explained in more
320
+ detail.
321
+
322
+
323
+ But beware! This will submit 245 (5 times 49) jobs for each outer fold. For a
324
+ 5 times 5-fold CV, this means 6125 jobs. This can be a lot of jobs, but not
325
+ for HTCondor. It is though an issue with the data transfer. If each job requires
326
+ 500 MB of data, this means 3.1 TB of data transfer, which requires 3.1 TB of
327
+ disk space in the shared directory.
328
+
329
+
330
+ Throttling
331
+ ~~~~~~~~~~
332
+
333
+ Indeed, even if we queue all of the previous 6125 jobs at once, we are also
334
+ limited by the number of slots in the cluster. We can *throttle* the number of
335
+ jobs that are submitted at once by setting the ``throttle`` parameter. This
336
+ parameter specifies the number of jobs that can be either running or queued at
337
+ the same time, thus also limiting the number of files in the shared directory.
338
+
339
+ `joblib-htcondor`_ will submit jobs until the throttle is reached, and then it
340
+ will wait until a job finishes to submit a new one. The complicated part is
341
+ that with recursive parallelization, the effect of throttling can be quite
342
+ complex. If you have a throttle of 100, and you have 5 outer folds, each with
343
+ 5 inner folds, and each inner fold with 49 hyperparameter combinations, you
344
+ will have 6125 jobs. If you throttle at 100, you will have 100 jobs running at
345
+ the same time, but you will have 2525 jobs queued or running, since the
346
+ throttle is applied to the inner folds independently.
347
+
348
+ We can also set the throttle parameter by level, by specifying a list of
349
+ throttles. For example, to throttle at 25 for the outer folds and at 50 for
350
+ the inner folds, we can do the following:
351
+
352
+
353
+ .. code-block:: python
354
+
355
+ from joblib import parallel_config
356
+ from joblib_htcondor import register_htcondor
357
+
358
+ register_htcondor("INFO")
359
+
360
+ creator = PipelineCreator(problem_type="classification")
361
+ creator.add("zscore")
362
+ creator.add("svm")
363
+
364
+ with parallel_config(
365
+ backend="htcondor",
366
+ n_jobs=-1,
367
+ request_cpus=1,
368
+ request_mem="4Gb",
369
+ shared_data_dir="/data/project/supercool_research",
370
+ pool="head2.htc.inm7.de",
371
+ max_recursion_level=1,
372
+ throttle=[25, 50],
373
+ ):
374
+ scores = run_cross_validation(
375
+ X=X,
376
+ y=y,
377
+ X_types=X_types,
378
+ data=df_data,
379
+ model=creator,
380
+ problem_type="classification",
381
+ cv=cv,
382
+ )
383
+
384
+
385
+ This indeed will create a maximum of 25 jobs for the outer folds and 50 jobs
386
+ for the inner folds, resulting in 1275 jobs in total. If each job requires 500
387
+ MB of data, this means 637.5 GB of disk space in the shared directory.
388
+
389
+ Overhead
390
+ ~~~~~~~~
391
+
392
+ The overhead of submitting jobs to a cluster is not negligible. Each job
393
+ submission requires some time, and the data transfer also requires some time.
394
+ This overhead can be quite significant, and it is important to take it into
395
+ account when parallelizing your code. The `joblib-htcondor`_ backend is
396
+ intended to be used for large computations, where the overhead is negligible
397
+ with respect to the computation.
398
+
399
+ Scikit-learn parallelization
400
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
401
+
402
+ The rule is quite simple, if it has an ``n_jobs`` parameter, it can be
403
+ parallelized using joblib. This is the case for most of the scikit-learn's
404
+ algorithms. While the developers of scikit-learn are doing a great job and
405
+ currently working on documenting how this is done for each algorithm, this is
406
+ still not that evident.
407
+
408
+ Most importantly, the default is always ``n_jobs=-1``. This means that it will
409
+ use joblib for everything. In combination with `joblib-htcondor`_, this can
410
+ be a big issue. Thus, the recommendation is to always set the ``n_jobs``
411
+ parameter to ``1`` for every learning algorithm that you use, unless it is an
412
+ ensemble in which every estimator takes hours to fit.
413
+
414
+ A clear example on when *NOT* to use ``n_jobs=-1`` is when using a
415
+ :external:class:`~sklearn.ensemble.RandomForestClassifier`. If left as default,
416
+ this will queue one job for each decision tree, resulting in hundreds of
417
+ relatively small jobs which will take a lot of time to complete, given the
418
+ overhead.
419
+
420
+
421
+ The following is a non-exhaustive list of scikit-learn's algorithms that might
422
+ make sense to set the ``n_jobs`` parameter to ``-1`` or leave as default:
423
+
424
+ * :external:class:`~sklearn.ensemble.StackingClassifier` and
425
+ :external:class:`~sklearn.ensemble.StackingRegressor`
426
+
427
+ This model will first fit the base estimators in parallel for the whole data.
428
+ Then it will fit and score the base estimators in parallel for each of the
429
+ internal CV folds, to generate the meta-features.
430
+
431
+ * :external:class:`~sklearn.ensemble.VotingClassifier` and
432
+ :external:class:`~sklearn.ensemble.VotingRegressor`
433
+
434
+ Similar to the stacking models.
435
+
436
+ * Hyperparameter searchers
437
+
438
+ Most of the hyperparameter searchers in scikit-learn will parallelize the
439
+ search for the best hyperparameters. Either at the internal CV level or at
440
+ both the hyperparameter search space and the internal CV level, depending
441
+ on the searcher.
442
+
443
+
444
+ Visualizing Progress
445
+ ~~~~~~~~~~~~~~~~~~~~
446
+
447
+ When submitting jobs to a cluster, it is important to know the progress of the
448
+ computation. The `joblib-htcondor`_ package has a text-based user interface
449
+ that allows to monitor the progress of the computation, an important feature
450
+ when submitting several recursive parallel jobs.
451
+
452
+ First, we need to set the ``export_metadata`` parameter to ``True`` in the
453
+ ``parallel_config`` context manager. This will create metadata files that
454
+ will be used by the UI.
455
+
456
+ .. code-block:: python
457
+
458
+ from joblib import parallel_config
459
+ from joblib_htcondor import register_htcondor
460
+
461
+ register_htcondor("INFO")
462
+
463
+ creator = PipelineCreator(problem_type="classification")
464
+ creator.add("zscore")
465
+ creator.add("svm")
466
+
467
+ with parallel_config(
468
+ backend="htcondor",
469
+ n_jobs=-1,
470
+ request_cpus=1,
471
+ request_mem="4Gb",
472
+ shared_data_dir="/data/project/supercool_research",
473
+ pool="head2.htc.inm7.de",
474
+ max_recursion_level=1,
475
+ throttle=[25, 50],
476
+ export_metadata=True,
477
+ ):
478
+ scores = run_cross_validation(
479
+ X=X,
480
+ y=y,
481
+ X_types=X_types,
482
+ data=df_data,
483
+ model=creator,
484
+ problem_type="classification",
485
+ cv=cv,
486
+ )
487
+
488
+
489
+ Once the jobs are submitted and running, we will see something like this with
490
+ the ``condor_q`` command:
491
+
492
+ .. image:: ../images/joblib_htcondor/condor_q.png
493
+ :alt: condor_q
494
+
495
+
496
+ The first row is the original job that is running `run_cross_validation`. The
497
+ rest are joblib-htcondor jobs.
498
+
499
+ Now this is difficult to understand, so we can use the `joblib-htcondor`_ UI,
500
+ pointing to the shared directory:
501
+
502
+ .. code-block:: bash
503
+
504
+ python -m joblib_htcondor.ui --path /data/project/supercool_research
505
+
506
+
507
+ While in the main window, we can press the ``O`` key to open a *root* file.
508
+ That is, the file that contains the metadata of the first batch of jobs that
509
+ triggered the rest of the jobs:
510
+
511
+ .. image:: ../images/joblib_htcondor/ui_open.png
512
+ :alt: Open Menu
513
+
514
+ Once we press ``ENTER``, we will then see the following:
515
+
516
+ .. image:: ../images/joblib_htcondor/ui_main.png
517
+ :alt: Main UI
518
+
519
+ The top row displays a summary per level of recursion. In this case, we have
520
+ two levels of recursion. The first level is the outer folds (5 times 5-fold)
521
+ and full final model fitting, and the second level is a Grid Search
522
+ (110 hyperparameter combinations, 5-fold CV). The throttle is set to 26 for
523
+ the outer fold and 60 for the inner folds.
524
+
525
+ On the bottom, we can see that it has been running for almost 4 days and
526
+ we are using 10% of the 1.3 Tb of shared disk space.
527
+
528
+
529
+ A more complex model might look like this:
530
+
531
+ .. image:: ../images/joblib_htcondor/ui_stacked.png
532
+ :alt: Stacked UI
533
+
534
+
535
+ This is a stacked model with a hyperparameter search created as follows:
536
+
537
+ .. code-block:: python
538
+
539
+ # Create the pipeline for the sepal features, by default will apply to "sepal"
540
+ model_sepal = PipelineCreator(problem_type="classification", apply_to="sepal")
541
+ model_sepal.add("filter_columns", apply_to="*", keep="sepal")
542
+ model_sepal.add("zscore")
543
+ model_sepal.add("svm", C=[0.1, 1])
544
+
545
+ # Create the pipeline for the petal features, by default will apply to "petal"
546
+ model_petal = PipelineCreator(problem_type="classification", apply_to="petal")
547
+ model_petal.add("filter_columns", apply_to="*", keep="petal")
548
+ model_petal.add("zscore")
549
+ model_petal.add("rf", n_jobs=1)
550
+
551
+ # Create the stacking model
552
+ model = PipelineCreator(problem_type="classification")
553
+ model.add(
554
+ "stacking",
555
+ estimators=[[("model_sepal", model_sepal), ("model_petal", model_petal)]],
556
+ final_estimator=LogisticRegression(n_jobs=1),
557
+ apply_to="*",
558
+ cv=4,
559
+ )
560
+
561
+
562
+ As you can see, even the
563
+ :external:class:`~sklearn.linear_model.LogisticRegression` model has an
564
+ ``n_jobs`` parameter that must be set to ``1``!
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.3.4.dev43'
16
- __version_tuple__ = version_tuple = (0, 3, 4, 'dev43')
15
+ __version__ = version = '0.3.5.dev16'
16
+ __version_tuple__ = version_tuple = (0, 3, 5, 'dev16')