fklearn 2.2.0__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. fklearn-2.3.0/PKG-INFO +68 -0
  2. {fklearn-2.2.0 → fklearn-2.3.0}/README.md +2 -3
  3. {fklearn-2.2.0 → fklearn-2.3.0}/requirements.txt +1 -1
  4. {fklearn-2.2.0 → fklearn-2.3.0}/requirements_test.txt +1 -0
  5. {fklearn-2.2.0 → fklearn-2.3.0}/setup.py +12 -3
  6. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/cate_learning/meta_learners.py +187 -7
  7. fklearn-2.3.0/src/fklearn/exceptions/exceptions.py +31 -0
  8. fklearn-2.3.0/src/fklearn/resources/VERSION +1 -0
  9. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/classification.py +67 -8
  10. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/transformation.py +1 -1
  11. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/unsupervised.py +7 -3
  12. fklearn-2.3.0/src/fklearn/validation/__init__.py +0 -0
  13. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/evaluators.py +55 -11
  14. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/splitters.py +3 -3
  15. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/validator.py +47 -11
  16. fklearn-2.3.0/src/fklearn.egg-info/PKG-INFO +68 -0
  17. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/SOURCES.txt +2 -0
  18. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/requires.txt +3 -1
  19. fklearn-2.2.0/PKG-INFO +0 -22
  20. fklearn-2.2.0/src/fklearn/resources/VERSION +0 -1
  21. fklearn-2.2.0/src/fklearn.egg-info/PKG-INFO +0 -22
  22. {fklearn-2.2.0 → fklearn-2.3.0}/LICENSE +0 -0
  23. {fklearn-2.2.0 → fklearn-2.3.0}/MANIFEST.in +0 -0
  24. {fklearn-2.2.0 → fklearn-2.3.0}/requirements_catboost.txt +0 -0
  25. {fklearn-2.2.0 → fklearn-2.3.0}/requirements_demos.txt +0 -0
  26. {fklearn-2.2.0 → fklearn-2.3.0}/requirements_lgbm.txt +0 -0
  27. {fklearn-2.2.0 → fklearn-2.3.0}/requirements_tools.txt +0 -0
  28. {fklearn-2.2.0 → fklearn-2.3.0}/requirements_xgboost.txt +0 -0
  29. {fklearn-2.2.0 → fklearn-2.3.0}/setup.cfg +0 -0
  30. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/__init__.py +0 -0
  31. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/__init__.py +0 -0
  32. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/cate_learning/__init__.py +0 -0
  33. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/cate_learning/double_machine_learning.py +0 -0
  34. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/debias.py +0 -0
  35. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/effects.py +0 -0
  36. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/__init__.py +0 -0
  37. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/auc.py +0 -0
  38. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/cate.py +0 -0
  39. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/curves.py +0 -0
  40. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/common_docstrings.py +0 -0
  41. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/data/__init__.py +0 -0
  42. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/data/datasets.py +0 -0
  43. {fklearn-2.2.0/src/fklearn/metrics → fklearn-2.3.0/src/fklearn/exceptions}/__init__.py +0 -0
  44. {fklearn-2.2.0/src/fklearn/preprocessing → fklearn-2.3.0/src/fklearn/metrics}/__init__.py +0 -0
  45. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/metrics/pd_extractors.py +0 -0
  46. {fklearn-2.2.0/src/fklearn/training → fklearn-2.3.0/src/fklearn/preprocessing}/__init__.py +0 -0
  47. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/preprocessing/rebalancing.py +0 -0
  48. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/preprocessing/schema.py +0 -0
  49. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/preprocessing/splitting.py +0 -0
  50. {fklearn-2.2.0/src/fklearn/tuning → fklearn-2.3.0/src/fklearn/training}/__init__.py +0 -0
  51. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/calibration.py +0 -0
  52. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/ensemble.py +0 -0
  53. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/imputation.py +0 -0
  54. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/pipeline.py +0 -0
  55. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/regression.py +0 -0
  56. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/utils.py +0 -0
  57. {fklearn-2.2.0/src/fklearn/validation → fklearn-2.3.0/src/fklearn/tuning}/__init__.py +0 -0
  58. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/model_agnostic_fc.py +0 -0
  59. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/parameter_tuners.py +0 -0
  60. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/samplers.py +0 -0
  61. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/selectors.py +0 -0
  62. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/stoppers.py +0 -0
  63. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/utils.py +0 -0
  64. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/types/__init__.py +0 -0
  65. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/types/types.py +0 -0
  66. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/perturbators.py +0 -0
  67. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/version.py +0 -0
  68. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/dependency_links.txt +0 -0
  69. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/not-zip-safe +0 -0
  70. {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/top_level.txt +0 -0
fklearn-2.3.0/PKG-INFO ADDED
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.1
2
+ Name: fklearn
3
+ Version: 2.3.0
4
+ Summary: Functional machine learning
5
+ Home-page: https://github.com/nubank/fklearn
6
+ Author: Nubank
7
+ Classifier: Programming Language :: Python :: 3.6
8
+ Classifier: Programming Language :: Python :: 3.7
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Requires-Python: >=3.6.2,<3.10
12
+ Description-Content-Type: text/markdown
13
+ Provides-Extra: test_deps
14
+ Provides-Extra: lgbm
15
+ Provides-Extra: xgboost
16
+ Provides-Extra: catboost
17
+ Provides-Extra: tools
18
+ Provides-Extra: devel
19
+ Provides-Extra: all_models
20
+ Provides-Extra: all
21
+ License-File: LICENSE
22
+
23
+ # fklearn: Functional Machine Learning
24
+
25
+ ![PyPI](https://img.shields.io/pypi/v/fklearn.svg?style=flat-square)
26
+ [![Documentation Status](https://readthedocs.org/projects/fklearn/badge/?version=latest)](https://fklearn.readthedocs.io/en/latest/?badge=latest)
27
+ [![Gitter](https://badges.gitter.im/fklearn-python/community.svg)](https://gitter.im/fklearn-python/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
28
+ ![Tests](https://github.com/nubank/fklearn/actions/workflows/push.yaml/badge.svg?branch=master)
29
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
30
+
31
+ **fklearn** uses functional programming principles to make it easier to solve real problems with Machine Learning.
32
+
33
+ The name is a reference to the widely known [scikit-learn](https://scikit-learn.org/stable/) library.
34
+
35
+ **fklearn Principles**
36
+
37
+ 1. Validation should reflect real-life situations.
38
+ 2. Production models should match validated models.
39
+ 3. Models should be production-ready with few extra steps.
40
+ 4. Reproducibility and in-depth analysis of model results should be easy to achieve.
41
+
42
+
43
+ [Documentation](https://fklearn.readthedocs.io/en/latest/) |
44
+ [Getting Started](https://fklearn.readthedocs.io/en/latest/getting_started.html) |
45
+ [API Docs](https://fklearn.readthedocs.io/en/latest/api/modules.html) |
46
+ [Contributing](https://fklearn.readthedocs.io/en/latest/contributing.html) |
47
+
48
+
49
+ ## Installation
50
+
51
+ To install via pip:
52
+
53
+ ```
54
+ pip install fklearn
55
+ ```
56
+
57
+ You can also install from the source:
58
+
59
+ ```sh
60
+ git clone git@github.com:nubank/fklearn.git
61
+ cd fklearn
62
+ git checkout master
63
+ pip install -e .
64
+ ```
65
+
66
+ ## License
67
+
68
+ [Apache License 2.0](LICENSE)
@@ -3,8 +3,7 @@
3
3
  ![PyPI](https://img.shields.io/pypi/v/fklearn.svg?style=flat-square)
4
4
  [![Documentation Status](https://readthedocs.org/projects/fklearn/badge/?version=latest)](https://fklearn.readthedocs.io/en/latest/?badge=latest)
5
5
  [![Gitter](https://badges.gitter.im/fklearn-python/community.svg)](https://gitter.im/fklearn-python/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
6
- [![CircleCI](https://circleci.com/gh/nubank/fklearn.svg?style=svg)](https://circleci.com/gh/nubank/fklearn)
7
- [![codecov.io](https://codecov.io/github/nubank/fklearn/branch/master/graph/badge.svg)](https://codecov.io/github/nubank/fklearn)
6
+ ![Tests](https://github.com/nubank/fklearn/actions/workflows/push.yaml/badge.svg?branch=master)
8
7
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
9
8
 
10
9
  **fklearn** uses functional programming principles to make it easier to solve real problems with Machine Learning.
@@ -21,7 +20,7 @@ The name is a reference to the widely known [scikit-learn](https://scikit-learn.
21
20
 
22
21
  [Documentation](https://fklearn.readthedocs.io/en/latest/) |
23
22
  [Getting Started](https://fklearn.readthedocs.io/en/latest/getting_started.html) |
24
- [API Docs](https://fklearn.readthedocs.io/en/latest/api.html) |
23
+ [API Docs](https://fklearn.readthedocs.io/en/latest/api/modules.html) |
25
24
  [Contributing](https://fklearn.readthedocs.io/en/latest/contributing.html) |
26
25
 
27
26
 
@@ -1,6 +1,6 @@
1
1
  joblib>=0.13.2,<2
2
2
  numpy>=1.16.4,<2
3
3
  pandas>=0.24.1,<2
4
- scikit-learn>=0.21.2,<0.24.0
4
+ scikit-learn>=0.21.2,<0.25.0
5
5
  statsmodels>=0.9.0,<1
6
6
  toolz>=0.9.0,<1
@@ -2,5 +2,6 @@ pytest>=4.2.1,<7
2
2
  pytest-cov>=2.6.1,<3
3
3
  pytest-xdist>=1.26.1,<3
4
4
  mypy>=0.670,<1
5
+ coverage<5
5
6
  codecov>=2.0,<3
6
7
  hypothesis>=5.5.4,<7
@@ -26,10 +26,15 @@ all_models_deps = lgbm_deps + xgboost_deps + catboost_deps
26
26
  all_deps = all_models_deps + tools_deps
27
27
  devel_deps = test_deps + all_deps
28
28
 
29
+ with open("README.md", "r") as fh:
30
+ long_description = fh.read()
31
+
29
32
  setup(name=MODULE_NAME,
30
33
  description="Functional machine learning",
34
+ long_description=long_description,
35
+ long_description_content_type="text/markdown",
31
36
  url='https://github.com/nubank/{:s}'.format(REPO_NAME),
32
- python_requires='>=3.6.2',
37
+ python_requires='>=3.6.2,<3.10',
33
38
  author="Nubank",
34
39
  package_dir={'': 'src'},
35
40
  packages=find_packages('src'),
@@ -46,5 +51,9 @@ setup(name=MODULE_NAME,
46
51
  "all": all_deps},
47
52
  include_package_data=True,
48
53
  zip_safe=False,
49
- classifiers=['Programming Language :: Python :: 3.6'])
50
-
54
+ classifiers=[
55
+ 'Programming Language :: Python :: 3.6',
56
+ 'Programming Language :: Python :: 3.7',
57
+ 'Programming Language :: Python :: 3.8',
58
+ 'Programming Language :: Python :: 3.9'
59
+ ])
@@ -1,6 +1,6 @@
1
1
  import copy
2
2
  import inspect
3
- from typing import Callable, List, Tuple
3
+ from typing import Callable, Dict, List, Tuple
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -194,25 +194,19 @@ def causal_s_classification_learner(
194
194
 
195
195
  Parameters
196
196
  ----------
197
-
198
197
  df : pd.DataFrame
199
198
  A Pandas' DataFrame with features and target columns.
200
199
  The model will be trained to predict the target column
201
200
  from the features.
202
-
203
201
  treatment_col: str
204
202
  The name of the column in `df` which contains the names of
205
203
  the treatments or control to which each data sample was subjected.
206
-
207
204
  control_name: str
208
205
  The name of the control group.
209
-
210
206
  prediction_column : str
211
207
  The name of the column with the predictions from the provided learner.
212
-
213
208
  learner: Callable
214
209
  A fklearn classification learner function.
215
-
216
210
  learner_transformers: list
217
211
  A list of fklearn transformer functions to be applied after the learner and before estimating the CATE.
218
212
  This parameter may be useful, for example, to estimate the CATE with calibrated classifiers.
@@ -266,3 +260,189 @@ def causal_s_classification_learner(
266
260
  causal_s_classification_learner.__doc__ += learner_return_docstring(
267
261
  "Causal S-Learner Classifier"
268
262
  )
263
+
264
+
265
+ def _simulate_t_learner_treatment_effect(
266
+ df: pd.DataFrame,
267
+ learners: dict,
268
+ treatments: list,
269
+ control_name: str,
270
+ prediction_column: str,
271
+ ) -> pd.DataFrame:
272
+ control_fcn = learners[control_name]
273
+ control_conversion_probability = control_fcn(df)[prediction_column].values
274
+
275
+ scored_df = df.copy()
276
+
277
+ uplift_cols = []
278
+ for treatment_name in treatments:
279
+ treatment_fcn = learners[treatment_name]
280
+ treatment_conversion_probability = treatment_fcn(df)[prediction_column].values
281
+
282
+ scored_df[
283
+ f"treatment_{treatment_name}__{prediction_column}_on_treatment"
284
+ ] = treatment_conversion_probability
285
+
286
+ uplift_cols.append(f"treatment_{treatment_name}__uplift")
287
+ scored_df[uplift_cols[-1]] = (
288
+ treatment_conversion_probability - control_conversion_probability
289
+ )
290
+
291
+ scored_df["uplift"] = scored_df[uplift_cols].max(axis=1).values
292
+ scored_df["suggested_treatment"] = np.where(
293
+ scored_df["uplift"].values <= 0,
294
+ control_name,
295
+ scored_df[uplift_cols].idxmax(axis=1).values,
296
+ )
297
+ scored_df["suggested_treatment"] = (
298
+ scored_df["suggested_treatment"]
299
+ .apply(lambda x: x.replace("__uplift", ""))
300
+ .values
301
+ )
302
+
303
+ return scored_df
304
+
305
+
306
+ def _get_model_fcn(
307
+ df: pd.DataFrame,
308
+ treatment_col: str,
309
+ treatment_name: str,
310
+ learner: Callable,
311
+ ) -> Tuple[Callable, dict, dict]:
312
+ """
313
+ Returns a function that predicts the target column from the features.
314
+ """
315
+
316
+ treatment_names = df[treatment_col].unique()
317
+
318
+ if treatment_name not in treatment_names:
319
+ raise MissingTreatmentError()
320
+
321
+ df = df.loc[df[treatment_col] == treatment_name].reset_index(drop=True).copy()
322
+
323
+ return learner(df)
324
+
325
+
326
+ def _get_learners(
327
+ df: pd.DataFrame,
328
+ control_learner: Callable,
329
+ treatment_learner: Callable,
330
+ unique_treatments: List[str],
331
+ control_name: str,
332
+ treatment_col: str,
333
+ ) -> Tuple[Dict[str, Callable], Dict[str, dict]]:
334
+ learners: Dict[str, Callable] = {}
335
+ logs: Dict[str, dict] = {}
336
+
337
+ learner_fcn, _, learner_logs = _get_model_fcn(
338
+ df, treatment_col, control_name, control_learner
339
+ )
340
+ learners[control_name] = learner_fcn
341
+ logs[control_name] = learner_logs
342
+
343
+ for treatment_name in unique_treatments:
344
+ learner_fcn, _, learner_logs = _get_model_fcn(
345
+ df, treatment_col, treatment_name, treatment_learner
346
+ )
347
+ learners[treatment_name] = learner_fcn
348
+ logs[treatment_name] = learner_logs
349
+
350
+ return learners, logs
351
+
352
+
353
+ @curry
354
+ def causal_t_classification_learner(
355
+ df: pd.DataFrame,
356
+ treatment_col: str,
357
+ control_name: str,
358
+ prediction_column: str,
359
+ learner: LearnerFnType,
360
+ treatment_learner: LearnerFnType = None,
361
+ learner_transformers: List[LearnerFnType] = None,
362
+ ) -> LearnerReturnType:
363
+ """
364
+ Fits a Causal T-Learner classifier. The T-Learner is a meta-learner which learns the
365
+ Conditional Average Treatment Effect (CATE) through the use of one Machine Learning
366
+ model for each treatment and for the control group. Each model is fitted in a subset of
367
+ the data, according to the treatment: the CATE $\tau$ is defined as
368
+ $\tau(x_{i}) = M_{1}(X=x_{i}, T=1) - M_{0}(X=x_{i}, T=0)$, being $M_{1}$ a model fitted
369
+ with treatment data and $M_{0}$ a model fitted with control data. Notice that $M_{0}$
370
+ and $M_{1}$ are traditional Machine Learning models such as a LightGBM Classifier and
371
+ that $x_{i}$ is the feature set of sample $i$.
372
+
373
+ **References:**
374
+
375
+ [1] https://matheusfacure.github.io/python-causality-handbook/21-Meta-Learners.html
376
+
377
+ [2] https://causalml.readthedocs.io/en/latest/methodology.html
378
+
379
+ Parameters
380
+ ----------
381
+ df : pd.DataFrame
382
+ A Pandas' DataFrame with features and target columns.
383
+ The model will be trained to predict the target column
384
+ from the features.
385
+ treatment_col: str
386
+ The name of the column in `df` which contains the names of
387
+ the treatments and control to which each data sample was subjected.
388
+ control_name: str
389
+ The name of the control group.
390
+ prediction_column : str
391
+ The name of the column with the predictions from the provided learner.
392
+ learner: LearnerFnType
393
+ A fklearn classification learner function.
394
+ treatment_learner: LearnerFnType
395
+ An optional fklearn classification learner function.
396
+ learner_transformers: List[LearnerFnType]
397
+ A list of fklearn transformer functions to be applied after the learner and before estimating the CATE.
398
+ This parameter may be useful, for example, to estimate the CATE with calibrated classifiers.
399
+ """
400
+
401
+ control_learner = copy.deepcopy(learner)
402
+
403
+ if treatment_learner is None:
404
+ treatment_learner = copy.deepcopy(learner)
405
+
406
+ # pipeline
407
+ if learner_transformers is not None:
408
+ learner_transformers = copy.deepcopy(learner_transformers)
409
+ control_learner_pipe = build_pipeline(*[control_learner] + learner_transformers)
410
+
411
+ treatment_learner_pipe = build_pipeline(
412
+ *[treatment_learner] + learner_transformers
413
+ )
414
+ else:
415
+ control_learner_pipe = copy.deepcopy(control_learner)
416
+ treatment_learner_pipe = copy.deepcopy(treatment_learner)
417
+
418
+ # learners
419
+ unique_treatments = _get_unique_treatments(df, treatment_col, control_name)
420
+
421
+ learners, learners_logs = _get_learners(
422
+ df=df,
423
+ control_learner=control_learner_pipe,
424
+ treatment_learner=treatment_learner_pipe,
425
+ unique_treatments=unique_treatments,
426
+ control_name=control_name,
427
+ treatment_col=treatment_col,
428
+ )
429
+
430
+ def p(new_df: pd.DataFrame) -> pd.DataFrame:
431
+ return _simulate_t_learner_treatment_effect(
432
+ new_df,
433
+ learners,
434
+ unique_treatments,
435
+ control_name,
436
+ prediction_column,
437
+ )
438
+
439
+ p.__doc__ = learner_pred_fn_docstring("causal_t_classification_learner")
440
+
441
+ log = {"causal_t_classification_learner": {**learners_logs}}
442
+
443
+ return p, p(df), log
444
+
445
+
446
+ causal_t_classification_learner.__doc__ += learner_return_docstring(
447
+ "Causal T-Learner Classifier"
448
+ )
@@ -0,0 +1,31 @@
1
+ from typing import Any, Dict, List
2
+
3
+
4
+ class MultipleTreatmentsError(Exception):
5
+ def __init__(
6
+ self,
7
+ msg: str = "Data contains multiple treatments.",
8
+ *args: List[Any],
9
+ **kwargs: Dict[str, Any]
10
+ ) -> None:
11
+ super().__init__(msg, *args, **kwargs)
12
+
13
+
14
+ class MissingControlError(Exception):
15
+ def __init__(
16
+ self,
17
+ msg: str = "Data does not contain the specified control.",
18
+ *args: List[Any],
19
+ **kwargs: Dict[str, Any]
20
+ ) -> None:
21
+ super().__init__(msg, *args, **kwargs)
22
+
23
+
24
+ class MissingTreatmentError(Exception):
25
+ def __init__(
26
+ self,
27
+ msg: str = "Data does not contain the specified treatment.",
28
+ *args: List[Any],
29
+ **kwargs: Dict[str, Any]
30
+ ) -> None:
31
+ super().__init__(msg, *args, **kwargs)
@@ -0,0 +1 @@
1
+ 2.3.0
@@ -1,7 +1,9 @@
1
- from typing import List, Any
1
+ from typing import List, Any, Optional, Callable, Tuple, Union
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
5
+ from lightgbm import Booster
6
+ from pathlib import Path
5
7
  from toolz import curry, merge, assoc
6
8
  from sklearn.feature_extraction.text import TfidfVectorizer
7
9
  from sklearn.linear_model import LogisticRegression
@@ -233,7 +235,7 @@ xgb_classification_learner.__doc__ += learner_return_docstring("XGboost Classifi
233
235
  @curry
234
236
  def _get_catboost_shap_values(df: pd.DataFrame, cbr: Any,
235
237
  features: List, target: str,
236
- weights: List, cat_features: List) -> np.array:
238
+ weights: List, cat_features: List) -> np.ndarray:
237
239
  """
238
240
  Auxiliary method to allow us to get shap values for Catboost multiclass models
239
241
 
@@ -446,7 +448,7 @@ def nlp_logistic_classification_learner(df: pd.DataFrame,
446
448
  """
447
449
 
448
450
  # set default params
449
- default_vect_params = {"strip_accents": "unicode", "min_df": 20}
451
+ default_vect_params = {"strip_accents": "unicode", "min_df": 1}
450
452
  merged_vect_params = default_vect_params if not vectorizer_params else merge(default_vect_params, vectorizer_params)
451
453
 
452
454
  default_clf_params = {"C": 0.1, "multi_class": "ovr", "solver": "liblinear"}
@@ -501,10 +503,24 @@ def lgbm_classification_learner(df: pd.DataFrame,
501
503
  target: str,
502
504
  learning_rate: float = 0.1,
503
505
  num_estimators: int = 100,
504
- extra_params: LogType = None,
506
+ extra_params: Optional[LogType] = None,
505
507
  prediction_column: str = "prediction",
506
- weight_column: str = None,
507
- encode_extra_cols: bool = True) -> LearnerReturnType:
508
+ weight_column: Optional[str] = None,
509
+ encode_extra_cols: bool = True,
510
+ valid_sets: Optional[List[pd.DataFrame]] = None,
511
+ valid_names: Optional[List[str]] = None,
512
+ feval: Optional[Union[
513
+ Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]],
514
+ List[Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]]]]
515
+ ] = None,
516
+ init_model: Optional[Union[str, Path, Booster]] = None,
517
+ feature_name: Union[List[str], str] = 'auto',
518
+ categorical_feature: Union[List[str], List[int], str] = 'auto',
519
+ keep_training_booster: bool = False,
520
+ callbacks: Optional[List[Callable]] = None,
521
+ dataset_init_score: Optional[Union[
522
+ List, List[List], np.ndarray, pd.Series, pd.DataFrame]
523
+ ] = None) -> LearnerReturnType:
508
524
  """
509
525
  Fits an LGBM classifier to the dataset.
510
526
 
@@ -557,6 +573,46 @@ def lgbm_classification_learner(df: pd.DataFrame,
557
573
 
558
574
  encode_extra_cols : bool (default: True)
559
575
  If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
576
+
577
+ valid_sets : list of pandas.DataFrame, optional (default=None)
578
+ A list of datasets to be used for early-stopping during training.
579
+
580
+ valid_names : list of strings, optional (default=None)
581
+ A list of dataset names matching the list of datasets provided through the ``valid_sets`` parameter.
582
+
583
+ feval : callable, list of callable, or None, optional (default=None)
584
+ Customized evaluation function. Each evaluation function should accept two parameters: preds, eval_data, and
585
+ return (eval_name, eval_result, is_higher_better) or list of such tuples.
586
+
587
+ init_model : str, pathlib.Path, Booster or None, optional (default=None)
588
+ Filename of a LightGBM model or a Booster instance used to continue training.
589
+
590
+ feature_name : list of str, or 'auto', optional (default="auto")
591
+ Feature names. If ‘auto’ and data is pandas DataFrame, data columns names are used.
592
+
593
+ categorical_feature : list of str or int, or 'auto', optional (default="auto")
594
+ Categorical features. If list of int, interpreted as indices. If list of str, interpreted as feature names (need
595
+ to specify feature_name as well). If ‘auto’ and data is pandas DataFrame, pandas unordered categorical columns
596
+ are used. All values in categorical features will be cast to int32 and thus should be less than int32 max value
597
+ (2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero.
598
+ All negative values in categorical features will be treated as missing values. The output cannot be
599
+ monotonically constrained with respect to a categorical feature. Floating point numbers in categorical features
600
+ will be rounded towards 0.
601
+
602
+ keep_training_booster : bool, optional (default=False)
603
+ Whether the returned Booster will be used to keep training. If False, the returned value will be converted into
604
+ _InnerPredictor before returning. This means you won’t be able to use eval, eval_train or eval_valid methods of
605
+ the returned Booster. When your model is very large and causes a memory error, you can try to set this param to
606
+ True to avoid the model conversion performed during the internal call of model_to_string. You can still use
607
+ _InnerPredictor as init_model to continue training in the future.
608
+
609
+ callbacks : list of callable, or None, optional (default=None)
610
+ List of callback functions that are applied at each iteration. See Callbacks in LightGBM Python API for more
611
+ information.
612
+
613
+ dataset_init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for
614
+ multi-class task), or None, optional (default=None)
615
+ Init score for Dataset. It could be the prediction of the majority class or a prediction from any other model.
560
616
  """
561
617
 
562
618
  import lightgbm as lgbm
@@ -570,9 +626,12 @@ def lgbm_classification_learner(df: pd.DataFrame,
570
626
  features = features if not encode_extra_cols else expand_features_encoded(df, features)
571
627
 
572
628
  dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
573
- silent=True)
629
+ silent=True, init_score=dataset_init_score)
574
630
 
575
- bst = lgbm.train(params, dtrain, num_estimators)
631
+ bst = lgbm.train(params=params, train_set=dtrain, num_boost_round=num_estimators, valid_sets=valid_sets,
632
+ valid_names=valid_names, feval=feval, init_model=init_model, feature_name=feature_name,
633
+ categorical_feature=categorical_feature, keep_training_booster=keep_training_booster,
634
+ callbacks=callbacks)
576
635
 
577
636
  def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
578
637
  if params["objective"] == "multiclass":
@@ -1027,7 +1027,7 @@ def missing_warner(df: pd.DataFrame, cols_list: List[str],
1027
1027
  cols_without_missing = df_selected.loc[:, df_selected.isna().sum(axis=0) == 0].columns.tolist()
1028
1028
 
1029
1029
  def p(dataset: pd.DataFrame) -> pd.DataFrame:
1030
- def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> np.array:
1030
+ def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> np.ndarray:
1031
1031
  cols_with_missing = np.array([np.where(df[col].isna(), col, "") for col in cols_to_check]).T
1032
1032
  missing_by_row_list = np.array([list(filter(None, x)) for x in cols_with_missing]).reshape(-1, 1)
1033
1033
  if missing_by_row_list.size == 0:
@@ -42,13 +42,17 @@ def isolation_forest_learner(df: pd.DataFrame,
42
42
  If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
43
43
  """
44
44
 
45
- default_params = {"n_jobs": -1, "random_state": 1729, "contamination": 0.1, "behaviour": "new"}
45
+ model = IsolationForest()
46
+
47
+ default_params: Dict[str, Any] = {"n_jobs": -1, "random_state": 1729, "contamination": 0.1}
48
+ # Remove this when we stop supporting scikit-learn<0.24 as this param is deprecated
49
+ if "behaviour" in model.get_params():
50
+ default_params["behaviour"] = "new"
46
51
  params = default_params if not params else merge(default_params, params)
52
+ model.set_params(**params)
47
53
 
48
54
  features = features if not encode_extra_cols else expand_features_encoded(df, features)
49
55
 
50
- model = IsolationForest()
51
- model.set_params(**params)
52
56
  model.fit(df[features].values)
53
57
 
54
58
  def p(new_df: pd.DataFrame) -> pd.DataFrame:
File without changes