upgini 1.1.168a1__py3-none-any.whl → 1.1.169a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/metrics.py +109 -8
- upgini/utils/sklearn_ext.py +1217 -0
- {upgini-1.1.168a1.dist-info → upgini-1.1.169a1.dist-info}/METADATA +1 -1
- {upgini-1.1.168a1.dist-info → upgini-1.1.169a1.dist-info}/RECORD +7 -6
- {upgini-1.1.168a1.dist-info → upgini-1.1.169a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.168a1.dist-info → upgini-1.1.169a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.168a1.dist-info → upgini-1.1.169a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1217 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
import numbers
|
|
4
|
+
import time
|
|
5
|
+
import warnings
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from contextlib import suppress
|
|
8
|
+
from itertools import compress
|
|
9
|
+
from traceback import format_exc
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import scipy.sparse as sp
|
|
13
|
+
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
14
|
+
from joblib import Parallel, logger
|
|
15
|
+
from scipy.sparse import issparse
|
|
16
|
+
from sklearn import config_context, get_config
|
|
17
|
+
from sklearn.base import clone, is_classifier
|
|
18
|
+
from sklearn.exceptions import FitFailedWarning, NotFittedError
|
|
19
|
+
from sklearn.metrics import check_scoring
|
|
20
|
+
from sklearn.metrics._scorer import _MultimetricScorer
|
|
21
|
+
from sklearn.model_selection import check_cv
|
|
22
|
+
from sklearn.utils.fixes import np_version, parse_version
|
|
23
|
+
from sklearn.utils.validation import indexable
|
|
24
|
+
|
|
25
|
+
# Table of estimator-tag names mapped to their default values.
# NOTE(review): this looks like a copy of scikit-learn's private default-tag
# table; the code that reads it is not visible in this part of the file --
# confirm where it is consumed before changing any entry.
_DEFAULT_TAGS = {
    "non_deterministic": False,
    "requires_positive_X": False,
    "requires_positive_y": False,
    "X_types": ["2darray"],
    "poor_score": False,
    "no_validation": False,
    "multioutput": False,
    "allow_nan": False,
    "stateless": False,
    "multilabel": False,
    "_skip_test": False,
    "_xfail_checks": False,
    "multioutput_only": False,
    "binary_only": False,
    "requires_fit": True,
    "preserves_dtype": [np.float64],
    "requires_y": False,
    "pairwise": False,
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def cross_validate(
    estimator,
    X,
    y=None,
    *,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=None,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
    return_train_score=False,
    return_estimator=False,
    error_score=np.nan,
):
    """Evaluate metric(s) by cross-validation and also record fit/score times.

    This is a customised variant of scikit-learn's ``cross_validate`` that
    runs every split through this module's ``_fit_and_score``. If the
    customised path raises, the error is logged and the call is delegated to
    scikit-learn's own ``sklearn.model_selection.cross_validate``.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be for example a list, or an array.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).

    scoring : str, callable, list, tuple, or dict, default=None
        Strategy to evaluate the performance of the cross-validated model on
        the test set.

        If `scoring` represents a single score, one can use:

        - a single string (see :ref:`scoring_parameter`);
        - a callable (see :ref:`scoring`) that returns a single value.

        If `scoring` represents multiple scores, one can use:

        - a list or tuple of unique strings;
        - a callable returning a dictionary where the keys are the metric
          names and the values are the metric scores;
        - a dictionary with metric names as keys and callables as values.

        See :ref:`multimetric_grid_search` for an example.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used. These splitters are instantiated
        with `shuffle=False` so the splits will be the same across calls.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    n_jobs : int, default=None
        Number of jobs to run in parallel. Training the estimator and computing
        the score are parallelized over the cross-validation splits.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs

        - An int, giving the exact number of total jobs that are
          spawned

        - A str, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    return_train_score : bool, default=False
        Whether to include train scores.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

        .. versionadded:: 0.19

        .. versionchanged:: 0.21
            Default value was changed from ``True`` to ``False``

    return_estimator : bool, default=False
        Whether to return the estimators fitted on each split.

        .. versionadded:: 0.20

    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised.

        .. versionadded:: 0.20

    Returns
    -------
    scores : dict of float arrays of shape (n_splits,)
        Array of scores of the estimator for each run of the cross validation.

        A dict of arrays containing the score/time arrays for each scorer is
        returned. The possible keys for this ``dict`` are:

            ``test_score``
                The score array for test scores on each cv split.
                Suffix ``_score`` in ``test_score`` changes to a specific
                metric like ``test_r2`` or ``test_auc`` if there are
                multiple scoring metrics in the scoring parameter.
            ``train_score``
                The score array for train scores on each cv split.
                Suffix ``_score`` in ``train_score`` changes to a specific
                metric like ``train_r2`` or ``train_auc`` if there are
                multiple scoring metrics in the scoring parameter.
                This is available only if ``return_train_score`` parameter
                is ``True``.
            ``fit_time``
                The time for fitting the estimator on the train
                set for each cv split.
            ``score_time``
                The time for scoring the estimator on the test set for each
                cv split. (Note time for scoring on the train set is not
                included even if ``return_train_score`` is set to ``True``.)
            ``estimator``
                The estimator objects for each cv split.
                This is available only if ``return_estimator`` parameter
                is set to ``True``.

    Examples
    --------
    >>> from sklearn import datasets, linear_model
    >>> from sklearn.model_selection import cross_validate
    >>> from sklearn.metrics import make_scorer
    >>> from sklearn.metrics import confusion_matrix
    >>> from sklearn.svm import LinearSVC
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()

    Single metric evaluation using ``cross_validate``

    >>> cv_results = cross_validate(lasso, X, y, cv=3)
    >>> sorted(cv_results.keys())
    ['fit_time', 'score_time', 'test_score']
    >>> cv_results['test_score']
    array([0.33150734, 0.08022311, 0.03531764])

    Multiple metric evaluation using ``cross_validate``
    (please refer the ``scoring`` parameter doc for more information)

    >>> scores = cross_validate(lasso, X, y, cv=3,
    ...                         scoring=('r2', 'neg_mean_squared_error'),
    ...                         return_train_score=True)
    >>> print(scores['test_neg_mean_squared_error'])
    [-3635.5... -3573.3... -6114.7...]
    >>> print(scores['train_r2'])
    [0.28010158 0.39088426 0.22784852]

    See Also
    --------
    cross_val_score : Run cross-validation for single metric evaluation.

    cross_val_predict : Get predictions from each split of cross-validation for
        diagnostic purposes.

    sklearn.metrics.make_scorer : Make a scorer from a performance metric or
        loss function.

    """
    try:
        X, y, groups = indexable(X, y, groups)

        cv = check_cv(cv, y, classifier=is_classifier(estimator))

        if callable(scoring):
            scorers = scoring
        elif scoring is None or isinstance(scoring, str):
            scorers = check_scoring(estimator, scoring)
        else:
            scorers = _check_multimetric_scoring(estimator, scoring)

        # We clone the estimator to make sure that all the folds are
        # independent, and that it is pickle-able.
        # NOTE(review): `delayed` is not among the imports visible at the top
        # of the file; presumably it is defined further down -- confirm.
        parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
        results = parallel(
            delayed(_fit_and_score)(
                clone(estimator),
                X,
                y,
                scorers,
                train,
                test,
                verbose,
                None,
                fit_params,
                return_train_score=return_train_score,
                return_times=True,
                return_estimator=return_estimator,
                error_score=error_score,
            )
            for train, test in cv.split(X, y, groups)
        )

        _warn_about_fit_failures(results, error_score)

        # For callable scoring, the return type is only known after calling.
        # If the return type is a dictionary, the error scores can now be
        # inserted with the correct key.
        if callable(scoring):
            _insert_error_scores(results, error_score)

        results = _aggregate_score_dicts(results)

        ret = {}
        ret["fit_time"] = results["fit_time"]
        ret["score_time"] = results["score_time"]

        if return_estimator:
            ret["estimator"] = results["estimator"]

        test_scores_dict = _normalize_score_results(results["test_scores"])
        if return_train_score:
            train_scores_dict = _normalize_score_results(results["train_scores"])

        for name in test_scores_dict:
            ret["test_%s" % name] = test_scores_dict[name]
            if return_train_score:
                key = "train_%s" % name
                ret[key] = train_scores_dict[name]

        return ret
    except Exception:
        logging.exception("Failed to execute overriden cross_validate. Fallback to original")
        # Inside this module the bare name `cross_validate` is this very
        # function, so calling it here would recurse until RecursionError.
        # Import scikit-learn's implementation under an alias and delegate
        # to that instead, which is what the log message promises.
        from sklearn.model_selection import cross_validate as sklearn_cross_validate

        return sklearn_cross_validate(
            estimator,
            X,
            y,
            groups=groups,
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
            fit_params=fit_params,
            pre_dispatch=pre_dispatch,
            return_train_score=return_train_score,
            return_estimator=return_estimator,
            error_score=error_score,
        )
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _fit_and_score(
    estimator,
    X,
    y,
    scorer,
    train,
    test,
    verbose,
    parameters,
    fit_params,
    return_train_score=False,
    return_parameters=False,
    return_n_test_samples=False,
    return_times=False,
    return_estimator=False,
    split_progress=None,
    candidate_progress=None,
    error_score=np.nan,
):
    """Fit an estimator on one train/test split and score it.

    When the estimator is a ``CatBoostClassifier`` or ``CatBoostRegressor``
    and a target is present, the test fold is additionally handed to ``fit``
    as ``eval_set``.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to fit.

    X : array-like of shape (n_samples, n_features)
        The data to fit.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The target, for supervised learning.

    scorer : callable or dict mapping scorer name to callable
        With a single callable, ``train_scores`` and ``test_scores`` are
        single floats; with a dict they are dicts keyed by scorer name.
        Each callable has the signature ``scorer(estimator, X, y)``.

    train : array-like of shape (n_train_samples,)
        Indices of training samples.

    test : array-like of shape (n_test_samples,)
        Indices of test samples.

    verbose : int
        The verbosity level; values above 1 print an END line per split and
        values above 9 also print a START line.

    parameters : dict or None
        Parameters to set on the estimator before fitting.

    fit_params : dict or None
        Parameters passed on to ``estimator.fit``.

    return_train_score : bool, default=False
        Also compute and return the score on the training set.

    return_parameters : bool, default=False
        Also return ``parameters``.

    return_n_test_samples : bool, default=False
        Also return the number of test samples.

    return_times : bool, default=False
        Also return the fit and score times.

    return_estimator : bool, default=False
        Also return the fitted estimator.

    split_progress : {list, tuple} of int, default=None
        ``(<current_split_id>, <total_num_of_splits>)``.

    candidate_progress : {list, tuple} of int, default=None
        ``(<current_candidate_id>, <total_number_of_candidates>)``.

    error_score : 'raise' or numeric, default=np.nan
        Score assigned when fitting fails. With 'raise' the error is
        re-raised instead.

    Returns
    -------
    result : dict with the following keys
        train_scores : dict of scorer name -> float (or a float)
            Only when ``return_train_score`` is true.
        test_scores : dict of scorer name -> float (or a float)
        n_test_samples : int
            Only when ``return_n_test_samples`` is true.
        fit_time : float
            Seconds spent fitting; only when ``return_times`` is true.
        score_time : float
            Seconds spent scoring; only when ``return_times`` is true.
        parameters : dict or None
            Only when ``return_parameters`` is true.
        estimator : estimator object
            Only when ``return_estimator`` is true.
        fit_error : str or None
            Traceback text if the fit failed, None otherwise.

    Raises
    ------
    ValueError
        If ``error_score`` is neither a number nor the string 'raise'.
    """
    if not isinstance(error_score, numbers.Number) and error_score != "raise":
        raise ValueError(
            "error_score must be the string 'raise' or a numeric value. "
            "(Hint: if using 'raise', please make sure that it has been "
            "spelled correctly.)"
        )

    # Progress prefix shared by the START and END log lines.
    progress = ""
    if verbose > 2:
        if split_progress is not None:
            progress = f" {split_progress[0]+1}/{split_progress[1]}"
        if candidate_progress and verbose > 9:
            progress += f"; {candidate_progress[0]+1}/{candidate_progress[1]}"

    if verbose > 1:
        if parameters is None:
            params_text = ""
        else:
            # Sorted so the printed output is deterministic.
            params_text = ", ".join(f"{name}={parameters[name]}" for name in sorted(parameters))
    if verbose > 9:
        banner = f"[CV{progress}] START {params_text}"
        print(f"{banner}{(80 - len(banner)) * '.'}")

    # Adjust length of sample weights
    if fit_params is None:
        fit_params = {}
    fit_params = _check_fit_params(X, fit_params, train)

    if parameters is not None:
        # Clone the values first: some of them may themselves be estimators
        # (e.g. pipeline steps), and a pipeline does not clone steps in fit.
        fresh_parameters = {name: clone(value, safe=False) for name, value in parameters.items()}
        estimator = estimator.set_params(**fresh_parameters)

    started_at = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    result = {}
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
                # Work on a copy so the caller-visible mapping is untouched.
                fit_params = fit_params.copy()
                fit_params["eval_set"] = [(X_test, y_test)]
            estimator.fit(X_train, y_train, **fit_params)

    except Exception:
        # The fit time is the time elapsed until the error.
        fit_time = time.time() - started_at
        score_time = 0.0
        if error_score == "raise":
            raise
        if isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = dict.fromkeys(scorer, error_score)
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
        result["fit_error"] = format_exc()
    else:
        result["fit_error"] = None

        fit_time = time.time() - started_at
        test_scores = _score(estimator, X_test, y_test, scorer, error_score)
        score_time = time.time() - started_at - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer, error_score)

    if verbose > 1:
        elapsed = score_time + fit_time
        summary = params_text + (";" if params_text else "")
        if verbose > 2:
            if isinstance(test_scores, dict):
                for metric in sorted(test_scores):
                    summary += f" {metric}: ("
                    if return_train_score:
                        summary += f"train={train_scores[metric]:.3f}, "
                    summary += f"test={test_scores[metric]:.3f})"
            elif return_train_score:
                summary += ", score="
                summary += f"(train={train_scores:.3f}, test={test_scores:.3f})"
            else:
                summary += ", score="
                summary += f"{test_scores:.3f}"
        summary += f" total time={logger.short_format_time(elapsed)}"

        # Pad with dots so the summary ends up right-aligned at column 80.
        header = f"[CV{progress}] END "
        filler = "." * (80 - len(header) - len(summary))
        print(header + filler + summary)

    result["test_scores"] = test_scores
    if return_train_score:
        result["train_scores"] = train_scores
    if return_n_test_samples:
        result["n_test_samples"] = _num_samples(X_test)
    if return_times:
        result["fit_time"] = fit_time
        result["score_time"] = score_time
    if return_parameters:
        result["parameters"] = parameters
    if return_estimator:
        result["estimator"] = estimator
    return result
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _aggregate_score_dicts(scores):
|
|
559
|
+
"""Aggregate the list of dict to dict of np ndarray
|
|
560
|
+
|
|
561
|
+
The aggregated output of _aggregate_score_dicts will be a list of dict
|
|
562
|
+
of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...]
|
|
563
|
+
Convert it to a dict of array {'prec': np.array([0.1 ...]), ...}
|
|
564
|
+
|
|
565
|
+
Parameters
|
|
566
|
+
----------
|
|
567
|
+
|
|
568
|
+
scores : list of dict
|
|
569
|
+
List of dicts of the scores for all scorers. This is a flat list,
|
|
570
|
+
assumed originally to be of row major order.
|
|
571
|
+
|
|
572
|
+
Example
|
|
573
|
+
-------
|
|
574
|
+
|
|
575
|
+
>>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3},
|
|
576
|
+
... {'a': 10, 'b': 10}] # doctest: +SKIP
|
|
577
|
+
>>> _aggregate_score_dicts(scores) # doctest: +SKIP
|
|
578
|
+
{'a': array([1, 2, 3, 10]),
|
|
579
|
+
'b': array([10, 2, 3, 10])}
|
|
580
|
+
"""
|
|
581
|
+
|
|
582
|
+
return {
|
|
583
|
+
key: np.asarray([score[key] for score in scores])
|
|
584
|
+
if isinstance(scores[0][key], numbers.Number)
|
|
585
|
+
else [score[key] for score in scores]
|
|
586
|
+
for key in scores[0]
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _insert_error_scores(results, error_score):
|
|
591
|
+
"""Insert error in `results` by replacing them inplace with `error_score`.
|
|
592
|
+
|
|
593
|
+
This only applies to multimetric scores because `_fit_and_score` will
|
|
594
|
+
handle the single metric case.
|
|
595
|
+
"""
|
|
596
|
+
|
|
597
|
+
successful_score = None
|
|
598
|
+
failed_indices = []
|
|
599
|
+
for i, result in enumerate(results):
|
|
600
|
+
if result["fit_error"] is not None:
|
|
601
|
+
failed_indices.append(i)
|
|
602
|
+
elif successful_score is None:
|
|
603
|
+
successful_score = result["test_scores"]
|
|
604
|
+
|
|
605
|
+
if successful_score is None:
|
|
606
|
+
raise NotFittedError("All estimators failed to fit")
|
|
607
|
+
|
|
608
|
+
if isinstance(successful_score, dict):
|
|
609
|
+
formatted_error = {name: error_score for name in successful_score}
|
|
610
|
+
for i in failed_indices:
|
|
611
|
+
results[i]["test_scores"] = formatted_error.copy()
|
|
612
|
+
if "train_scores" in results[i]:
|
|
613
|
+
results[i]["train_scores"] = formatted_error.copy()
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def _warn_about_fit_failures(results, error_score):
|
|
617
|
+
fit_errors = [result["fit_error"] for result in results if result["fit_error"] is not None]
|
|
618
|
+
if fit_errors:
|
|
619
|
+
num_failed_fits = len(fit_errors)
|
|
620
|
+
num_fits = len(results)
|
|
621
|
+
fit_errors_counter = Counter(fit_errors)
|
|
622
|
+
delimiter = "-" * 80 + "\n"
|
|
623
|
+
fit_errors_summary = "\n".join(
|
|
624
|
+
f"{delimiter}{n} fits failed with the following error:\n{error}" for error, n in fit_errors_counter.items()
|
|
625
|
+
)
|
|
626
|
+
|
|
627
|
+
some_fits_failed_message = (
|
|
628
|
+
f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
|
|
629
|
+
"The score on these train-test partitions for these parameters"
|
|
630
|
+
f" will be set to {error_score}.\n"
|
|
631
|
+
"If these failures are not expected, you can try to debug them "
|
|
632
|
+
"by setting error_score='raise'.\n\n"
|
|
633
|
+
f"Below are more details about the failures:\n{fit_errors_summary}"
|
|
634
|
+
)
|
|
635
|
+
warnings.warn(some_fits_failed_message, FitFailedWarning, stacklevel=1)
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
def _normalize_score_results(scores, scaler_score_key="score"):
|
|
639
|
+
"""Creates a scoring dictionary based on the type of `scores`"""
|
|
640
|
+
if isinstance(scores[0], dict):
|
|
641
|
+
# multimetric scoring
|
|
642
|
+
return _aggregate_score_dicts(scores)
|
|
643
|
+
# scaler
|
|
644
|
+
return {scaler_score_key: scores}
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def _check_multimetric_scoring(estimator, scoring):
|
|
648
|
+
"""Check the scoring parameter in cases when multiple metrics are allowed.
|
|
649
|
+
|
|
650
|
+
Parameters
|
|
651
|
+
----------
|
|
652
|
+
estimator : sklearn estimator instance
|
|
653
|
+
The estimator for which the scoring will be applied.
|
|
654
|
+
|
|
655
|
+
scoring : list, tuple or dict
|
|
656
|
+
Strategy to evaluate the performance of the cross-validated model on
|
|
657
|
+
the test set.
|
|
658
|
+
|
|
659
|
+
The possibilities are:
|
|
660
|
+
|
|
661
|
+
- a list or tuple of unique strings;
|
|
662
|
+
- a callable returning a dictionary where they keys are the metric
|
|
663
|
+
names and the values are the metric scores;
|
|
664
|
+
- a dictionary with metric names as keys and callables a values.
|
|
665
|
+
|
|
666
|
+
See :ref:`multimetric_grid_search` for an example.
|
|
667
|
+
|
|
668
|
+
Returns
|
|
669
|
+
-------
|
|
670
|
+
scorers_dict : dict
|
|
671
|
+
A dict mapping each scorer name to its validated scorer.
|
|
672
|
+
"""
|
|
673
|
+
err_msg_generic = (
|
|
674
|
+
f"scoring is invalid (got {scoring!r}). Refer to the "
|
|
675
|
+
"scoring glossary for details: "
|
|
676
|
+
"https://scikit-learn.org/stable/glossary.html#term-scoring"
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
if isinstance(scoring, (list, tuple, set)):
|
|
680
|
+
err_msg = "The list/tuple elements must be unique strings of predefined scorers. "
|
|
681
|
+
try:
|
|
682
|
+
keys = set(scoring)
|
|
683
|
+
except TypeError as e:
|
|
684
|
+
raise ValueError(err_msg) from e
|
|
685
|
+
|
|
686
|
+
if len(keys) != len(scoring):
|
|
687
|
+
raise ValueError(f"{err_msg} Duplicate elements were found in" f" the given list. {scoring!r}")
|
|
688
|
+
elif len(keys) > 0:
|
|
689
|
+
if not all(isinstance(k, str) for k in keys):
|
|
690
|
+
if any(callable(k) for k in keys):
|
|
691
|
+
raise ValueError(
|
|
692
|
+
f"{err_msg} One or more of the elements "
|
|
693
|
+
"were callables. Use a dict of score "
|
|
694
|
+
"name mapped to the scorer callable. "
|
|
695
|
+
f"Got {scoring!r}"
|
|
696
|
+
)
|
|
697
|
+
else:
|
|
698
|
+
raise ValueError(f"{err_msg} Non-string types were found " f"in the given list. Got {scoring!r}")
|
|
699
|
+
scorers = {scorer: check_scoring(estimator, scoring=scorer) for scorer in scoring}
|
|
700
|
+
else:
|
|
701
|
+
raise ValueError(f"{err_msg} Empty list was given. {scoring!r}")
|
|
702
|
+
|
|
703
|
+
elif isinstance(scoring, dict):
|
|
704
|
+
keys = set(scoring)
|
|
705
|
+
if not all(isinstance(k, str) for k in keys):
|
|
706
|
+
raise ValueError("Non-string types were found in the keys of " f"the given dict. scoring={scoring!r}")
|
|
707
|
+
if len(keys) == 0:
|
|
708
|
+
raise ValueError(f"An empty dict was passed. {scoring!r}")
|
|
709
|
+
scorers = {key: check_scoring(estimator, scoring=scorer) for key, scorer in scoring.items()}
|
|
710
|
+
else:
|
|
711
|
+
raise ValueError(err_msg_generic)
|
|
712
|
+
return scorers
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def _score(estimator, X_test, y_test, scorer, error_score="raise"):
|
|
716
|
+
"""Compute the score(s) of an estimator on a given test set.
|
|
717
|
+
|
|
718
|
+
Will return a dict of floats if `scorer` is a dict, otherwise a single
|
|
719
|
+
float is returned.
|
|
720
|
+
"""
|
|
721
|
+
if isinstance(scorer, dict):
|
|
722
|
+
# will cache method calls if needed. scorer() returns a dict
|
|
723
|
+
scorer = _MultimetricScorer(**scorer)
|
|
724
|
+
|
|
725
|
+
try:
|
|
726
|
+
if y_test is None:
|
|
727
|
+
scores = scorer(estimator, X_test)
|
|
728
|
+
else:
|
|
729
|
+
scores = scorer(estimator, X_test, y_test)
|
|
730
|
+
except Exception:
|
|
731
|
+
if error_score == "raise":
|
|
732
|
+
raise
|
|
733
|
+
else:
|
|
734
|
+
if isinstance(scorer, _MultimetricScorer):
|
|
735
|
+
scores = {name: error_score for name in scorer._scorers}
|
|
736
|
+
else:
|
|
737
|
+
scores = error_score
|
|
738
|
+
warnings.warn(
|
|
739
|
+
"Scoring failed. The score on this train-test partition for "
|
|
740
|
+
f"these parameters will be set to {error_score}. Details: \n"
|
|
741
|
+
f"{format_exc()}",
|
|
742
|
+
UserWarning,
|
|
743
|
+
stacklevel=1,
|
|
744
|
+
)
|
|
745
|
+
|
|
746
|
+
error_msg = "scoring must return a number, got %s (%s) instead. (scorer=%s)"
|
|
747
|
+
if isinstance(scores, dict):
|
|
748
|
+
for name, score in scores.items():
|
|
749
|
+
if hasattr(score, "item"):
|
|
750
|
+
with suppress(ValueError):
|
|
751
|
+
# e.g. unwrap memmapped scalars
|
|
752
|
+
score = score.item()
|
|
753
|
+
if not isinstance(score, numbers.Number):
|
|
754
|
+
raise ValueError(error_msg % (score, type(score), name))
|
|
755
|
+
scores[name] = score
|
|
756
|
+
else: # scalar
|
|
757
|
+
if hasattr(scores, "item"):
|
|
758
|
+
with suppress(ValueError):
|
|
759
|
+
# e.g. unwrap memmapped scalars
|
|
760
|
+
scores = scores.item()
|
|
761
|
+
if not isinstance(scores, numbers.Number):
|
|
762
|
+
raise ValueError(error_msg % (scores, type(scores), scorer))
|
|
763
|
+
return scores
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def _safe_split(estimator, X, y, indices, train_indices=None):
    """Slice ``X`` and ``y`` for cross-validation, handling pairwise inputs.

    For a pairwise estimator (as reported by ``_is_pairwise``) ``X`` is
    treated as a precomputed square kernel / affinity / distance matrix:
    rows are selected with ``indices`` and columns with ``train_indices``
    (or with ``indices`` again when ``train_indices`` is None). Otherwise
    only the rows of ``X`` are selected.

    Labels ``y`` are always indexed along the first axis only.

    Parameters
    ----------
    estimator : object
        Estimator used to decide between row-only and row-and-column slicing.

    X : array-like, sparse matrix or iterable
        Data to be indexed. Must expose ``shape`` and be square when the
        estimator is pairwise.

    y : array-like, sparse matrix or iterable
        Targets to be indexed. May be None.

    indices : array of int
        Rows to select from X and y.

    train_indices : array of int or None, default=None
        Columns to select from X for a pairwise estimator.

    Returns
    -------
    X_subset : array-like, sparse matrix or list
        Indexed data.

    y_subset : array-like, sparse matrix or list
        Indexed targets, or None when ``y`` is None.

    Raises
    ------
    ValueError
        If the estimator is pairwise and ``X`` has no ``shape`` or is not
        square.
    """
    if not _is_pairwise(estimator):
        X_subset = _safe_indexing(X, indices)
    else:
        if not hasattr(X, "shape"):
            raise ValueError(
                "Precomputed kernels or affinity matrices have " "to be passed as arrays or sparse matrices."
            )
        # X is a precomputed square kernel matrix
        n_rows, n_cols = X.shape[0], X.shape[1]
        if n_rows != n_cols:
            raise ValueError("X should be a square kernel matrix")
        column_indices = indices if train_indices is None else train_indices
        X_subset = X[np.ix_(indices, column_indices)]

    y_subset = None if y is None else _safe_indexing(y, indices)
    return X_subset, y_subset
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
def _is_pairwise(estimator):
    """Return whether ``estimator`` is pairwise.

    The decision combines the ``_pairwise`` attribute with the ``pairwise``
    estimator tag:

    - attribute absent: the tag value is returned;
    - attribute present and equal to the tag: the attribute value is
      returned without a warning;
    - attribute present and different from the tag: a ``FutureWarning`` is
      issued and the attribute value is returned.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if the estimator is pairwise and False otherwise.
    """
    with warnings.catch_warnings():
        # Reading the attribute itself must not surface a FutureWarning.
        warnings.filterwarnings("ignore", category=FutureWarning)
        attribute_present = hasattr(estimator, "_pairwise")
        attribute_value = getattr(estimator, "_pairwise", False)
    # NOTE(review): the tag lookup is placed outside the warning filter, as in
    # upstream scikit-learn; confirm against the packaged file.
    tag_value = _safe_tags(estimator, key="pairwise")

    if not attribute_present:
        # use pairwise tag when the attribute is not present
        return tag_value

    if attribute_value != tag_value:
        warnings.warn(
            "_pairwise was deprecated in 0.24 and will be removed in 1.1 "
            "(renaming of 0.26). Set the estimator tags of your estimator "
            "instead",
            FutureWarning,
            stacklevel=1,
        )
    return attribute_value
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def _safe_tags(estimator, key=None):
    """Fetch estimator tags, falling back to defaults when necessary.

    Tags are resolved in this order:

    1. ``estimator._get_tags()`` when that method exists;
    2. ``_DEFAULT_TAGS`` overlaid with ``estimator._more_tags()`` when only
       ``_more_tags`` exists;
    3. ``_DEFAULT_TAGS`` otherwise.

    Parameters
    ----------
    estimator : estimator object
        The estimator from which to get the tag.

    key : str, default=None
        Tag name to get. By default (`None`), all tags are returned.

    Returns
    -------
    tags : dict or tag value
        The estimator tags. A single value is returned if `key` is not None.

    Raises
    ------
    ValueError
        If ``key`` is given but missing from the resolved tags.
    """
    if hasattr(estimator, "_get_tags"):
        tags = estimator._get_tags()
        tags_provider = "_get_tags()"
    elif hasattr(estimator, "_more_tags"):
        tags = {**_DEFAULT_TAGS, **estimator._more_tags()}
        tags_provider = "_more_tags()"
    else:
        tags = _DEFAULT_TAGS
        tags_provider = "_DEFAULT_TAGS"

    if key is None:
        return tags
    if key in tags:
        return tags[key]
    raise ValueError(
        f"The key {key} is not defined in {tags_provider} for the " f"class {estimator.__class__.__name__}."
    )
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` are only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiple columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, 'bool' and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1. `X` itself is returned when `indices`
        is None.

    Raises
    ------
    ValueError
        If `axis` is not 0 or 1; if `axis=1` and `X` is not 2-dimensional
        (including objects without an `ndim` attribute, such as lists); if
        string indices are used with `axis=0`, or with `axis=1` on an object
        that has no `loc` attribute.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if indices is None:
        return X

    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index " " column). Got {} instead.".format(axis)
        )

    if axis == 1 and not hasattr(X, "ndim"):
        # Containers without ``ndim`` (e.g. plain lists) have no columns to
        # select; fail with a clear message instead of an AttributeError on
        # the ``X.ndim`` access below.
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead.".format(type(X))
        )

    indices_dtype = _determine_key_type(indices)

    if axis == 0 and indices_dtype == "str":
        raise ValueError("String indexing is not supported with 'axis=0'")

    if axis == 1 and X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )

    if axis == 1 and indices_dtype == "str" and not hasattr(X, "loc"):
        raise ValueError("Specifying the columns using strings is only supported for " "pandas DataFrames")

    # Dispatch on the container flavour: pandas objects, array-likes exposing
    # ``shape``, and finally plain Python sequences.
    if hasattr(X, "iloc"):
        return _pandas_indexing(X, indices, indices_dtype, axis=axis)
    elif hasattr(X, "shape"):
        return _array_indexing(X, indices, indices_dtype, axis=axis)
    else:
        return _list_indexing(X, indices, indices_dtype)
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
def _array_indexing(array, key, key_dtype, axis):
    """Index a NumPy array or scipy.sparse matrix along ``axis``.

    A boolean key is converted with ``np.asarray`` when ``array`` is sparse
    (or NumPy is older than 1.12); a tuple key is converted to a list before
    indexing. ``axis == 0`` indexes rows, any other value indexes columns.
    """
    # FIXME: Remove the check for NumPy when using >= 1.12
    needs_array_mask = np_version < parse_version("1.12") or issparse(array)
    if needs_array_mask and key_dtype == "bool":
        # boolean array-likes must be real arrays to index properly here
        key = np.asarray(key)

    if isinstance(key, tuple):
        key = list(key)

    if axis == 0:
        return array[key]
    return array[:, key]
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or series with ``key`` along ``axis``.

    Integer keys that are neither a slice nor a scalar go through
    ``X.take``; other integer keys use ``X.iloc`` and all remaining key
    types use ``X.loc``.
    """
    if hasattr(key, "shape"):
        # Work-around for indexing with read-only key in pandas
        # FIXME: solved in pandas 0.25
        key = np.asarray(key)
        if not key.flags.writeable:
            key = key.copy()
    elif isinstance(key, tuple):
        key = list(key)

    positional = key_dtype == "int"
    if positional and not isinstance(key, slice) and not np.isscalar(key):
        # take() rather than iloc[] yields a "proper" copy that will not
        # raise SettingWithCopyWarning
        return X.take(key, axis=axis)

    # positional keys go through iloc, everything else through loc
    indexer = X.iloc if positional else X.loc
    if axis:
        return indexer[:, key]
    return indexer[key]
|
|
1027
|
+
|
|
1028
|
+
|
|
1029
|
+
def _list_indexing(X, key, key_dtype):
    """Index a Python list with a scalar, slice, boolean mask or integers."""
    if isinstance(key, slice) or np.isscalar(key):
        # scalars and slices are handled natively by the sequence
        return X[key]

    if key_dtype == "bool":
        # boolean array-like: keep the items whose flag is truthy
        return [item for item, keep in zip(X, key) if keep]

    # integer array-like: pick the items one position at a time
    return [X[position] for position in key]
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
def _determine_key_type(key, accept_slice=True):
    """Classify an indexing key as ``'int'``, ``'str'``, ``'bool'`` or None.

    Parameters
    ----------
    key : scalar, slice or array-like
        The key from which we want to infer the data type.

    accept_slice : bool, default=True
        When False, a slice key raises ``TypeError``.

    Returns
    -------
    dtype : {'int', 'str', 'bool', None}
        The data type of key. None is returned for a None key, a slice with
        neither start nor stop, and an empty list or tuple.

    Raises
    ------
    TypeError
        If ``key`` is a slice and ``accept_slice`` is False.
    ValueError
        If the key type is unsupported or its elements have mixed types.
    """
    err_msg = (
        "No valid specification of the columns. Only a scalar, list or "
        "slice of all integers or all strings, or boolean mask is "
        "allowed"
    )

    scalar_type_names = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
    array_kind_names = {
        "i": "int",
        "u": "int",
        "b": "bool",
        "O": "str",
        "U": "str",
        "S": "str",
    }

    if key is None:
        return None

    if isinstance(key, tuple(scalar_type_names)):
        # The lookup uses the exact type, so subclasses that pass the
        # isinstance check but are not table keys are rejected.
        try:
            return scalar_type_names[type(key)]
        except KeyError:
            raise ValueError(err_msg)

    if isinstance(key, slice):
        if not accept_slice:
            raise TypeError("Only array-like or scalar are supported. A Python slice was given.")
        if key.start is None and key.stop is None:
            return None
        start_kind = _determine_key_type(key.start)
        stop_kind = _determine_key_type(key.stop)
        both_known = start_kind is not None and stop_kind is not None
        if both_known and start_kind != stop_kind:
            raise ValueError(err_msg)
        return stop_kind if start_kind is None else start_kind

    if isinstance(key, (list, tuple)):
        element_kinds = {_determine_key_type(element) for element in set(key)}
        if not element_kinds:
            return None
        if len(element_kinds) != 1:
            raise ValueError(err_msg)
        return element_kinds.pop()

    if hasattr(key, "dtype"):
        try:
            return array_kind_names[key.dtype.kind]
        except KeyError:
            raise ValueError(err_msg)

    raise ValueError(err_msg)
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
# remove when https://github.com/joblib/joblib/issues/1071 is fixed
|
|
1110
|
+
def delayed(function):
    """Decorator used to capture the arguments of a function.

    The returned callable does not run ``function``. It returns a
    ``(_FuncWrapper(function), args, kwargs)`` triple describing the call.
    """

    def capture_call(*args, **kwargs):
        return _FuncWrapper(function), args, kwargs

    # Copy the metadata of ``function`` (name, docstring, ...) onto the
    # capturing callable.
    return functools.wraps(function)(capture_call)
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
class _FuncWrapper:
    """Load the global configuration before calling the function.

    The configuration returned by ``get_config()`` is captured when the
    wrapper is created and re-applied through ``config_context`` around
    every call.
    """

    def __init__(self, function):
        self.config = get_config()
        self.function = function
        # Mirror the wrapped function's metadata on this instance.
        functools.update_wrapper(self, function)

    def __call__(self, *args, **kwargs):
        captured_config = self.config
        target = self.function
        with config_context(**captured_config):
            return target(*args, **kwargs)
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def _check_fit_params(X, fit_params, indices=None):
    """Validate the parameters passed during `fit` and subset them.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data array.

    fit_params : dict
        Dictionary containing the parameters passed at fit.

    indices : array-like of shape (n_samples,), default=None
        Indices to be selected if the parameter has the same size as `X`.

    Returns
    -------
    fit_params_validated : dict
        A new dict. Values that are array-like with as many samples as `X`
        are made indexable and subset with `indices`; all other values are
        passed through unchanged.
    """
    validated = {}
    for name, value in fit_params.items():
        if _is_arraylike(value) and _num_samples(value) == _num_samples(X):
            # Sample-aligned parameters must support indexing
            # (e.g. for cross-validation).
            validated[name] = _safe_indexing(_make_indexable(value), indices)
        else:
            # Non-indexable pass-through (for now for backward-compatibility).
            # https://github.com/scikit-learn/scikit-learn/issues/15805
            validated[name] = value
    return validated
|
|
1166
|
+
|
|
1167
|
+
|
|
1168
|
+
def _is_arraylike(x):
    """Return True when ``x`` exposes ``__len__``, ``shape`` or ``__array__``."""
    return any(hasattr(x, marker) for marker in ("__len__", "shape", "__array__"))
|
|
1171
|
+
|
|
1172
|
+
|
|
1173
|
+
def _make_indexable(iterable):
    """Return ``iterable`` in a form that supports indexing.

    Sparse matrices are converted to CSR. ``None`` and objects exposing
    ``__getitem__`` or ``iloc`` (e.g. pandas dataframes) pass through
    unchanged. Anything else is converted with ``np.array``.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()

    already_usable = iterable is None or hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc")
    if already_usable:
        return iterable

    return np.array(iterable)
|
|
1191
|
+
|
|
1192
|
+
|
|
1193
|
+
def _num_samples(x):
    """Return number of samples in array-like x.

    The first dimension of ``x.shape`` is used when it is an integer;
    otherwise ``len(x)`` is returned.

    Raises
    ------
    TypeError
        If ``x`` has a callable ``fit`` attribute, is not sized or
        array-like, is a zero-dimensional array, or does not support
        ``len``.
    """
    message = "Expected sequence or array-like, got %s" % type(x)

    if hasattr(x, "fit") and callable(x.fit):
        # Don't get num_samples from an ensembles length!
        raise TypeError(message)

    has_size_info = hasattr(x, "__len__") or hasattr(x, "shape")
    if not has_size_info:
        if not hasattr(x, "__array__"):
            raise TypeError(message)
        x = np.asarray(x)

    shape = getattr(x, "shape", None)
    if shape is not None:
        if len(shape) == 0:
            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        leading_dim = shape[0]
        if isinstance(leading_dim, numbers.Integral):
            return leading_dim

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error
|