fklearn 2.2.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fklearn-2.3.0/PKG-INFO +68 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/README.md +2 -3
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements.txt +1 -1
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements_test.txt +1 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/setup.py +12 -3
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/cate_learning/meta_learners.py +187 -7
- fklearn-2.3.0/src/fklearn/exceptions/exceptions.py +31 -0
- fklearn-2.3.0/src/fklearn/resources/VERSION +1 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/classification.py +67 -8
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/transformation.py +1 -1
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/unsupervised.py +7 -3
- fklearn-2.3.0/src/fklearn/validation/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/evaluators.py +55 -11
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/splitters.py +3 -3
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/validator.py +47 -11
- fklearn-2.3.0/src/fklearn.egg-info/PKG-INFO +68 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/SOURCES.txt +2 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/requires.txt +3 -1
- fklearn-2.2.0/PKG-INFO +0 -22
- fklearn-2.2.0/src/fklearn/resources/VERSION +0 -1
- fklearn-2.2.0/src/fklearn.egg-info/PKG-INFO +0 -22
- {fklearn-2.2.0 → fklearn-2.3.0}/LICENSE +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/MANIFEST.in +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements_catboost.txt +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements_demos.txt +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements_lgbm.txt +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements_tools.txt +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/requirements_xgboost.txt +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/setup.cfg +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/cate_learning/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/cate_learning/double_machine_learning.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/debias.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/effects.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/auc.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/cate.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/causal/validation/curves.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/common_docstrings.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/data/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/data/datasets.py +0 -0
- {fklearn-2.2.0/src/fklearn/metrics → fklearn-2.3.0/src/fklearn/exceptions}/__init__.py +0 -0
- {fklearn-2.2.0/src/fklearn/preprocessing → fklearn-2.3.0/src/fklearn/metrics}/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/metrics/pd_extractors.py +0 -0
- {fklearn-2.2.0/src/fklearn/training → fklearn-2.3.0/src/fklearn/preprocessing}/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/preprocessing/rebalancing.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/preprocessing/schema.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/preprocessing/splitting.py +0 -0
- {fklearn-2.2.0/src/fklearn/tuning → fklearn-2.3.0/src/fklearn/training}/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/calibration.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/ensemble.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/imputation.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/pipeline.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/regression.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/training/utils.py +0 -0
- {fklearn-2.2.0/src/fklearn/validation → fklearn-2.3.0/src/fklearn/tuning}/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/model_agnostic_fc.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/parameter_tuners.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/samplers.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/selectors.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/stoppers.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/tuning/utils.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/types/__init__.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/types/types.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/validation/perturbators.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn/version.py +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/dependency_links.txt +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/not-zip-safe +0 -0
- {fklearn-2.2.0 → fklearn-2.3.0}/src/fklearn.egg-info/top_level.txt +0 -0
fklearn-2.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: fklearn
|
|
3
|
+
Version: 2.3.0
|
|
4
|
+
Summary: Functional machine learning
|
|
5
|
+
Home-page: https://github.com/nubank/fklearn
|
|
6
|
+
Author: Nubank
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Requires-Python: >=3.6.2,<3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Provides-Extra: test_deps
|
|
14
|
+
Provides-Extra: lgbm
|
|
15
|
+
Provides-Extra: xgboost
|
|
16
|
+
Provides-Extra: catboost
|
|
17
|
+
Provides-Extra: tools
|
|
18
|
+
Provides-Extra: devel
|
|
19
|
+
Provides-Extra: all_models
|
|
20
|
+
Provides-Extra: all
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
|
|
23
|
+
# fklearn: Functional Machine Learning
|
|
24
|
+
|
|
25
|
+

|
|
26
|
+
[](https://fklearn.readthedocs.io/en/latest/?badge=latest)
|
|
27
|
+
[](https://gitter.im/fklearn-python/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
|
|
28
|
+

|
|
29
|
+
[](https://opensource.org/licenses/Apache-2.0)
|
|
30
|
+
|
|
31
|
+
**fklearn** uses functional programming principles to make it easier to solve real problems with Machine Learning.
|
|
32
|
+
|
|
33
|
+
The name is a reference to the widely known [scikit-learn](https://scikit-learn.org/stable/) library.
|
|
34
|
+
|
|
35
|
+
**fklearn Principles**
|
|
36
|
+
|
|
37
|
+
1. Validation should reflect real-life situations.
|
|
38
|
+
2. Production models should match validated models.
|
|
39
|
+
3. Models should be production-ready with few extra steps.
|
|
40
|
+
4. Reproducibility and in-depth analysis of model results should be easy to achieve.
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
[Documentation](https://fklearn.readthedocs.io/en/latest/) |
|
|
44
|
+
[Getting Started](https://fklearn.readthedocs.io/en/latest/getting_started.html) |
|
|
45
|
+
[API Docs](https://fklearn.readthedocs.io/en/latest/api/modules.html) |
|
|
46
|
+
[Contributing](https://fklearn.readthedocs.io/en/latest/contributing.html) |
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
To install via pip:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
pip install fklearn
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
You can also install from the source:
|
|
58
|
+
|
|
59
|
+
```sh
|
|
60
|
+
git clone git@github.com:nubank/fklearn.git
|
|
61
|
+
cd fklearn
|
|
62
|
+
git checkout master
|
|
63
|
+
pip install -e .
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## License
|
|
67
|
+
|
|
68
|
+
[Apache License 2.0](LICENSE)
|
|
@@ -3,8 +3,7 @@
|
|
|
3
3
|

|
|
4
4
|
[](https://fklearn.readthedocs.io/en/latest/?badge=latest)
|
|
5
5
|
[](https://gitter.im/fklearn-python/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
|
|
6
|
-
|
|
7
|
-
[](https://codecov.io/github/nubank/fklearn)
|
|
6
|
+

|
|
8
7
|
[](https://opensource.org/licenses/Apache-2.0)
|
|
9
8
|
|
|
10
9
|
**fklearn** uses functional programming principles to make it easier to solve real problems with Machine Learning.
|
|
@@ -21,7 +20,7 @@ The name is a reference to the widely known [scikit-learn](https://scikit-learn.
|
|
|
21
20
|
|
|
22
21
|
[Documentation](https://fklearn.readthedocs.io/en/latest/) |
|
|
23
22
|
[Getting Started](https://fklearn.readthedocs.io/en/latest/getting_started.html) |
|
|
24
|
-
[API Docs](https://fklearn.readthedocs.io/en/latest/api.html) |
|
|
23
|
+
[API Docs](https://fklearn.readthedocs.io/en/latest/api/modules.html) |
|
|
25
24
|
[Contributing](https://fklearn.readthedocs.io/en/latest/contributing.html) |
|
|
26
25
|
|
|
27
26
|
|
|
@@ -26,10 +26,15 @@ all_models_deps = lgbm_deps + xgboost_deps + catboost_deps
|
|
|
26
26
|
all_deps = all_models_deps + tools_deps
|
|
27
27
|
devel_deps = test_deps + all_deps
|
|
28
28
|
|
|
29
|
+
with open("README.md", "r") as fh:
|
|
30
|
+
long_description = fh.read()
|
|
31
|
+
|
|
29
32
|
setup(name=MODULE_NAME,
|
|
30
33
|
description="Functional machine learning",
|
|
34
|
+
long_description=long_description,
|
|
35
|
+
long_description_content_type="text/markdown",
|
|
31
36
|
url='https://github.com/nubank/{:s}'.format(REPO_NAME),
|
|
32
|
-
python_requires='>=3.6.2',
|
|
37
|
+
python_requires='>=3.6.2,<3.10',
|
|
33
38
|
author="Nubank",
|
|
34
39
|
package_dir={'': 'src'},
|
|
35
40
|
packages=find_packages('src'),
|
|
@@ -46,5 +51,9 @@ setup(name=MODULE_NAME,
|
|
|
46
51
|
"all": all_deps},
|
|
47
52
|
include_package_data=True,
|
|
48
53
|
zip_safe=False,
|
|
49
|
-
classifiers=[
|
|
50
|
-
|
|
54
|
+
classifiers=[
|
|
55
|
+
'Programming Language :: Python :: 3.6',
|
|
56
|
+
'Programming Language :: Python :: 3.7',
|
|
57
|
+
'Programming Language :: Python :: 3.8',
|
|
58
|
+
'Programming Language :: Python :: 3.9'
|
|
59
|
+
])
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
import inspect
|
|
3
|
-
from typing import Callable, List, Tuple
|
|
3
|
+
from typing import Callable, Dict, List, Tuple
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -194,25 +194,19 @@ def causal_s_classification_learner(
|
|
|
194
194
|
|
|
195
195
|
Parameters
|
|
196
196
|
----------
|
|
197
|
-
|
|
198
197
|
df : pd.DataFrame
|
|
199
198
|
A Pandas' DataFrame with features and target columns.
|
|
200
199
|
The model will be trained to predict the target column
|
|
201
200
|
from the features.
|
|
202
|
-
|
|
203
201
|
treatment_col: str
|
|
204
202
|
The name of the column in `df` which contains the names of
|
|
205
203
|
the treatments or control to which each data sample was subjected.
|
|
206
|
-
|
|
207
204
|
control_name: str
|
|
208
205
|
The name of the control group.
|
|
209
|
-
|
|
210
206
|
prediction_column : str
|
|
211
207
|
The name of the column with the predictions from the provided learner.
|
|
212
|
-
|
|
213
208
|
learner: Callable
|
|
214
209
|
A fklearn classification learner function.
|
|
215
|
-
|
|
216
210
|
learner_transformers: list
|
|
217
211
|
A list of fklearn transformer functions to be applied after the learner and before estimating the CATE.
|
|
218
212
|
This parameter may be useful, for example, to estimate the CATE with calibrated classifiers.
|
|
@@ -266,3 +260,189 @@ def causal_s_classification_learner(
|
|
|
266
260
|
causal_s_classification_learner.__doc__ += learner_return_docstring(
|
|
267
261
|
"Causal S-Learner Classifier"
|
|
268
262
|
)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _simulate_t_learner_treatment_effect(
|
|
266
|
+
df: pd.DataFrame,
|
|
267
|
+
learners: dict,
|
|
268
|
+
treatments: list,
|
|
269
|
+
control_name: str,
|
|
270
|
+
prediction_column: str,
|
|
271
|
+
) -> pd.DataFrame:
|
|
272
|
+
control_fcn = learners[control_name]
|
|
273
|
+
control_conversion_probability = control_fcn(df)[prediction_column].values
|
|
274
|
+
|
|
275
|
+
scored_df = df.copy()
|
|
276
|
+
|
|
277
|
+
uplift_cols = []
|
|
278
|
+
for treatment_name in treatments:
|
|
279
|
+
treatment_fcn = learners[treatment_name]
|
|
280
|
+
treatment_conversion_probability = treatment_fcn(df)[prediction_column].values
|
|
281
|
+
|
|
282
|
+
scored_df[
|
|
283
|
+
f"treatment_{treatment_name}__{prediction_column}_on_treatment"
|
|
284
|
+
] = treatment_conversion_probability
|
|
285
|
+
|
|
286
|
+
uplift_cols.append(f"treatment_{treatment_name}__uplift")
|
|
287
|
+
scored_df[uplift_cols[-1]] = (
|
|
288
|
+
treatment_conversion_probability - control_conversion_probability
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
scored_df["uplift"] = scored_df[uplift_cols].max(axis=1).values
|
|
292
|
+
scored_df["suggested_treatment"] = np.where(
|
|
293
|
+
scored_df["uplift"].values <= 0,
|
|
294
|
+
control_name,
|
|
295
|
+
scored_df[uplift_cols].idxmax(axis=1).values,
|
|
296
|
+
)
|
|
297
|
+
scored_df["suggested_treatment"] = (
|
|
298
|
+
scored_df["suggested_treatment"]
|
|
299
|
+
.apply(lambda x: x.replace("__uplift", ""))
|
|
300
|
+
.values
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
return scored_df
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _get_model_fcn(
|
|
307
|
+
df: pd.DataFrame,
|
|
308
|
+
treatment_col: str,
|
|
309
|
+
treatment_name: str,
|
|
310
|
+
learner: Callable,
|
|
311
|
+
) -> Tuple[Callable, dict, dict]:
|
|
312
|
+
"""
|
|
313
|
+
Returns a function that predicts the target column from the features.
|
|
314
|
+
"""
|
|
315
|
+
|
|
316
|
+
treatment_names = df[treatment_col].unique()
|
|
317
|
+
|
|
318
|
+
if treatment_name not in treatment_names:
|
|
319
|
+
raise MissingTreatmentError()
|
|
320
|
+
|
|
321
|
+
df = df.loc[df[treatment_col] == treatment_name].reset_index(drop=True).copy()
|
|
322
|
+
|
|
323
|
+
return learner(df)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _get_learners(
|
|
327
|
+
df: pd.DataFrame,
|
|
328
|
+
control_learner: Callable,
|
|
329
|
+
treatment_learner: Callable,
|
|
330
|
+
unique_treatments: List[str],
|
|
331
|
+
control_name: str,
|
|
332
|
+
treatment_col: str,
|
|
333
|
+
) -> Tuple[Dict[str, Callable], Dict[str, dict]]:
|
|
334
|
+
learners: Dict[str, Callable] = {}
|
|
335
|
+
logs: Dict[str, dict] = {}
|
|
336
|
+
|
|
337
|
+
learner_fcn, _, learner_logs = _get_model_fcn(
|
|
338
|
+
df, treatment_col, control_name, control_learner
|
|
339
|
+
)
|
|
340
|
+
learners[control_name] = learner_fcn
|
|
341
|
+
logs[control_name] = learner_logs
|
|
342
|
+
|
|
343
|
+
for treatment_name in unique_treatments:
|
|
344
|
+
learner_fcn, _, learner_logs = _get_model_fcn(
|
|
345
|
+
df, treatment_col, treatment_name, treatment_learner
|
|
346
|
+
)
|
|
347
|
+
learners[treatment_name] = learner_fcn
|
|
348
|
+
logs[treatment_name] = learner_logs
|
|
349
|
+
|
|
350
|
+
return learners, logs
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
@curry
|
|
354
|
+
def causal_t_classification_learner(
|
|
355
|
+
df: pd.DataFrame,
|
|
356
|
+
treatment_col: str,
|
|
357
|
+
control_name: str,
|
|
358
|
+
prediction_column: str,
|
|
359
|
+
learner: LearnerFnType,
|
|
360
|
+
treatment_learner: LearnerFnType = None,
|
|
361
|
+
learner_transformers: List[LearnerFnType] = None,
|
|
362
|
+
) -> LearnerReturnType:
|
|
363
|
+
"""
|
|
364
|
+
Fits a Causal T-Learner classifier. The T-Learner is a meta-learner which learns the
|
|
365
|
+
Conditional Average Treatment Effect (CATE) through the use of one Machine Learning
|
|
366
|
+
model for each treatment and for the control group. Each model is fitted in a subset of
|
|
367
|
+
the data, according to the treatment: the CATE $\tau$ is defined as
|
|
368
|
+
$\tau(x_{i}) = M_{1}(X=x_{i}, T=1) - M_{0}(X=x_{i}, T=0)$, being $M_{1}$ a model fitted
|
|
369
|
+
with treatment data and $M_{0}$ a model fitted with control data. Notice that $M_{0}$
|
|
370
|
+
and $M_{1}$ are traditional Machine Learning models such as a LightGBM Classifier and
|
|
371
|
+
that $x_{i}$ is the feature set of sample $i$.
|
|
372
|
+
|
|
373
|
+
**References:**
|
|
374
|
+
|
|
375
|
+
[1] https://matheusfacure.github.io/python-causality-handbook/21-Meta-Learners.html
|
|
376
|
+
|
|
377
|
+
[2] https://causalml.readthedocs.io/en/latest/methodology.html
|
|
378
|
+
|
|
379
|
+
Parameters
|
|
380
|
+
----------
|
|
381
|
+
df : pd.DataFrame
|
|
382
|
+
A Pandas' DataFrame with features and target columns.
|
|
383
|
+
The model will be trained to predict the target column
|
|
384
|
+
from the features.
|
|
385
|
+
treatment_col: str
|
|
386
|
+
The name of the column in `df` which contains the names of
|
|
387
|
+
the treatments and control to which each data sample was subjected.
|
|
388
|
+
control_name: str
|
|
389
|
+
The name of the control group.
|
|
390
|
+
prediction_column : str
|
|
391
|
+
The name of the column with the predictions from the provided learner.
|
|
392
|
+
learner: LearnerFnType
|
|
393
|
+
A fklearn classification learner function.
|
|
394
|
+
treatment_learner: LearnerFnType
|
|
395
|
+
An optional fklearn classification learner function.
|
|
396
|
+
learner_transformers: List[LearnerFnType]
|
|
397
|
+
A list of fklearn transformer functions to be applied after the learner and before estimating the CATE.
|
|
398
|
+
This parameter may be useful, for example, to estimate the CATE with calibrated classifiers.
|
|
399
|
+
"""
|
|
400
|
+
|
|
401
|
+
control_learner = copy.deepcopy(learner)
|
|
402
|
+
|
|
403
|
+
if treatment_learner is None:
|
|
404
|
+
treatment_learner = copy.deepcopy(learner)
|
|
405
|
+
|
|
406
|
+
# pipeline
|
|
407
|
+
if learner_transformers is not None:
|
|
408
|
+
learner_transformers = copy.deepcopy(learner_transformers)
|
|
409
|
+
control_learner_pipe = build_pipeline(*[control_learner] + learner_transformers)
|
|
410
|
+
|
|
411
|
+
treatment_learner_pipe = build_pipeline(
|
|
412
|
+
*[treatment_learner] + learner_transformers
|
|
413
|
+
)
|
|
414
|
+
else:
|
|
415
|
+
control_learner_pipe = copy.deepcopy(control_learner)
|
|
416
|
+
treatment_learner_pipe = copy.deepcopy(treatment_learner)
|
|
417
|
+
|
|
418
|
+
# learners
|
|
419
|
+
unique_treatments = _get_unique_treatments(df, treatment_col, control_name)
|
|
420
|
+
|
|
421
|
+
learners, learners_logs = _get_learners(
|
|
422
|
+
df=df,
|
|
423
|
+
control_learner=control_learner_pipe,
|
|
424
|
+
treatment_learner=treatment_learner_pipe,
|
|
425
|
+
unique_treatments=unique_treatments,
|
|
426
|
+
control_name=control_name,
|
|
427
|
+
treatment_col=treatment_col,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
def p(new_df: pd.DataFrame) -> pd.DataFrame:
|
|
431
|
+
return _simulate_t_learner_treatment_effect(
|
|
432
|
+
new_df,
|
|
433
|
+
learners,
|
|
434
|
+
unique_treatments,
|
|
435
|
+
control_name,
|
|
436
|
+
prediction_column,
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
p.__doc__ = learner_pred_fn_docstring("causal_t_classification_learner")
|
|
440
|
+
|
|
441
|
+
log = {"causal_t_classification_learner": {**learners_logs}}
|
|
442
|
+
|
|
443
|
+
return p, p(df), log
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
causal_t_classification_learner.__doc__ += learner_return_docstring(
|
|
447
|
+
"Causal T-Learner Classifier"
|
|
448
|
+
)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MultipleTreatmentsError(Exception):
|
|
5
|
+
def __init__(
|
|
6
|
+
self,
|
|
7
|
+
msg: str = "Data contains multiple treatments.",
|
|
8
|
+
*args: List[Any],
|
|
9
|
+
**kwargs: Dict[str, Any]
|
|
10
|
+
) -> None:
|
|
11
|
+
super().__init__(msg, *args, **kwargs)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MissingControlError(Exception):
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
msg: str = "Data does not contain the specified control.",
|
|
18
|
+
*args: List[Any],
|
|
19
|
+
**kwargs: Dict[str, Any]
|
|
20
|
+
) -> None:
|
|
21
|
+
super().__init__(msg, *args, **kwargs)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MissingTreatmentError(Exception):
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
msg: str = "Data does not contain the specified treatment.",
|
|
28
|
+
*args: List[Any],
|
|
29
|
+
**kwargs: Dict[str, Any]
|
|
30
|
+
) -> None:
|
|
31
|
+
super().__init__(msg, *args, **kwargs)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
2.3.0
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
from typing import List, Any
|
|
1
|
+
from typing import List, Any, Optional, Callable, Tuple, Union
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
5
|
+
from lightgbm import Booster
|
|
6
|
+
from pathlib import Path
|
|
5
7
|
from toolz import curry, merge, assoc
|
|
6
8
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
7
9
|
from sklearn.linear_model import LogisticRegression
|
|
@@ -233,7 +235,7 @@ xgb_classification_learner.__doc__ += learner_return_docstring("XGboost Classifi
|
|
|
233
235
|
@curry
|
|
234
236
|
def _get_catboost_shap_values(df: pd.DataFrame, cbr: Any,
|
|
235
237
|
features: List, target: str,
|
|
236
|
-
weights: List, cat_features: List) -> np.
|
|
238
|
+
weights: List, cat_features: List) -> np.ndarray:
|
|
237
239
|
"""
|
|
238
240
|
Auxiliar method to allow us to get shap values for Catboost multiclass models
|
|
239
241
|
|
|
@@ -446,7 +448,7 @@ def nlp_logistic_classification_learner(df: pd.DataFrame,
|
|
|
446
448
|
"""
|
|
447
449
|
|
|
448
450
|
# set default params
|
|
449
|
-
default_vect_params = {"strip_accents": "unicode", "min_df":
|
|
451
|
+
default_vect_params = {"strip_accents": "unicode", "min_df": 1}
|
|
450
452
|
merged_vect_params = default_vect_params if not vectorizer_params else merge(default_vect_params, vectorizer_params)
|
|
451
453
|
|
|
452
454
|
default_clf_params = {"C": 0.1, "multi_class": "ovr", "solver": "liblinear"}
|
|
@@ -501,10 +503,24 @@ def lgbm_classification_learner(df: pd.DataFrame,
|
|
|
501
503
|
target: str,
|
|
502
504
|
learning_rate: float = 0.1,
|
|
503
505
|
num_estimators: int = 100,
|
|
504
|
-
extra_params: LogType = None,
|
|
506
|
+
extra_params: Optional[LogType] = None,
|
|
505
507
|
prediction_column: str = "prediction",
|
|
506
|
-
weight_column: str = None,
|
|
507
|
-
encode_extra_cols: bool = True
|
|
508
|
+
weight_column: Optional[str] = None,
|
|
509
|
+
encode_extra_cols: bool = True,
|
|
510
|
+
valid_sets: Optional[List[pd.DataFrame]] = None,
|
|
511
|
+
valid_names: Optional[List[str]] = None,
|
|
512
|
+
feval: Optional[Union[
|
|
513
|
+
Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]],
|
|
514
|
+
List[Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]]]]
|
|
515
|
+
] = None,
|
|
516
|
+
init_model: Optional[Union[str, Path, Booster]] = None,
|
|
517
|
+
feature_name: Union[List[str], str] = 'auto',
|
|
518
|
+
categorical_feature: Union[List[str], List[int], str] = 'auto',
|
|
519
|
+
keep_training_booster: bool = False,
|
|
520
|
+
callbacks: Optional[List[Callable]] = None,
|
|
521
|
+
dataset_init_score: Optional[Union[
|
|
522
|
+
List, List[List], np.ndarray, pd.Series, pd.DataFrame]
|
|
523
|
+
] = None) -> LearnerReturnType:
|
|
508
524
|
"""
|
|
509
525
|
Fits an LGBM classifier to the dataset.
|
|
510
526
|
|
|
@@ -557,6 +573,46 @@ def lgbm_classification_learner(df: pd.DataFrame,
|
|
|
557
573
|
|
|
558
574
|
encode_extra_cols : bool (default: True)
|
|
559
575
|
If True, treats all columns in `df` with name pattern fklearn_feat__col==val` as feature columns.
|
|
576
|
+
|
|
577
|
+
valid_sets : list of pandas.DataFrame, optional (default=None)
|
|
578
|
+
A list of datasets to be used for early-stopping during training.
|
|
579
|
+
|
|
580
|
+
valid_names : list of strings, optional (default=None)
|
|
581
|
+
A list of dataset names matching the list of datasets provided through the ``valid_sets`` parameter.
|
|
582
|
+
|
|
583
|
+
feval : callable, list of callable, or None, optional (default=None)
|
|
584
|
+
Customized evaluation function. Each evaluation function should accept two parameters: preds, eval_data, and
|
|
585
|
+
return (eval_name, eval_result, is_higher_better) or list of such tuples.
|
|
586
|
+
|
|
587
|
+
init_model : str, pathlib.Path, Booster or None, optional (default=None)
|
|
588
|
+
Filename of LightGBM model or Booster instance used for continue training.
|
|
589
|
+
|
|
590
|
+
feature_name : list of str, or 'auto', optional (default="auto")
|
|
591
|
+
Feature names. If ‘auto’ and data is pandas DataFrame, data columns names are used.
|
|
592
|
+
|
|
593
|
+
categorical_feature : list of str or int, or 'auto', optional (default="auto")
|
|
594
|
+
Categorical features. If list of int, interpreted as indices. If list of str, interpreted as feature names (need
|
|
595
|
+
to specify feature_name as well). If ‘auto’ and data is pandas DataFrame, pandas unordered categorical columns
|
|
596
|
+
are used. All values in categorical features will be cast to int32 and thus should be less than int32 max value
|
|
597
|
+
(2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero.
|
|
598
|
+
All negative values in categorical features will be treated as missing values. The output cannot be
|
|
599
|
+
monotonically constrained with respect to a categorical feature. Floating point numbers in categorical features
|
|
600
|
+
will be rounded towards 0.
|
|
601
|
+
|
|
602
|
+
keep_training_booster : bool, optional (default=False)
|
|
603
|
+
Whether the returned Booster will be used to keep training. If False, the returned value will be converted into
|
|
604
|
+
_InnerPredictor before returning. This means you won’t be able to use eval, eval_train or eval_valid methods of
|
|
605
|
+
the returned Booster. When your model is very large and cause the memory error, you can try to set this param to
|
|
606
|
+
True to avoid the model conversion performed during the internal call of model_to_string. You can still use
|
|
607
|
+
_InnerPredictor as init_model for future continue training.
|
|
608
|
+
|
|
609
|
+
callbacks : list of callable, or None, optional (default=None)
|
|
610
|
+
List of callback functions that are applied at each iteration. See Callbacks in LightGBM Python API for more
|
|
611
|
+
information.
|
|
612
|
+
|
|
613
|
+
dataset_init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for
|
|
614
|
+
multi-class task), or None, optional (default=None)
|
|
615
|
+
Init score for Dataset. It could be the prediction of the majority class or a prediction from any other model.
|
|
560
616
|
"""
|
|
561
617
|
|
|
562
618
|
import lightgbm as lgbm
|
|
@@ -570,9 +626,12 @@ def lgbm_classification_learner(df: pd.DataFrame,
|
|
|
570
626
|
features = features if not encode_extra_cols else expand_features_encoded(df, features)
|
|
571
627
|
|
|
572
628
|
dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
|
|
573
|
-
silent=True)
|
|
629
|
+
silent=True, init_score=dataset_init_score)
|
|
574
630
|
|
|
575
|
-
bst = lgbm.train(params, dtrain, num_estimators
|
|
631
|
+
bst = lgbm.train(params=params, train_set=dtrain, num_boost_round=num_estimators, valid_sets=valid_sets,
|
|
632
|
+
valid_names=valid_names, feval=feval, init_model=init_model, feature_name=feature_name,
|
|
633
|
+
categorical_feature=categorical_feature, keep_training_booster=keep_training_booster,
|
|
634
|
+
callbacks=callbacks)
|
|
576
635
|
|
|
577
636
|
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
|
|
578
637
|
if params["objective"] == "multiclass":
|
|
@@ -1027,7 +1027,7 @@ def missing_warner(df: pd.DataFrame, cols_list: List[str],
|
|
|
1027
1027
|
cols_without_missing = df_selected.loc[:, df_selected.isna().sum(axis=0) == 0].columns.tolist()
|
|
1028
1028
|
|
|
1029
1029
|
def p(dataset: pd.DataFrame) -> pd.DataFrame:
|
|
1030
|
-
def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> np.
|
|
1030
|
+
def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> np.ndarray:
|
|
1031
1031
|
cols_with_missing = np.array([np.where(df[col].isna(), col, "") for col in cols_to_check]).T
|
|
1032
1032
|
missing_by_row_list = np.array([list(filter(None, x)) for x in cols_with_missing]).reshape(-1, 1)
|
|
1033
1033
|
if missing_by_row_list.size == 0:
|
|
@@ -42,13 +42,17 @@ def isolation_forest_learner(df: pd.DataFrame,
|
|
|
42
42
|
If True, treats all columns in `df` with name pattern fklearn_feat__col==val` as feature columns.
|
|
43
43
|
"""
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
model = IsolationForest()
|
|
46
|
+
|
|
47
|
+
default_params: Dict[str, Any] = {"n_jobs": -1, "random_state": 1729, "contamination": 0.1}
|
|
48
|
+
# Remove this when we stop supporting scikit-learn<0.24 as this param is deprecated
|
|
49
|
+
if "behaviour" in model.get_params():
|
|
50
|
+
default_params["behaviour"] = "new"
|
|
46
51
|
params = default_params if not params else merge(default_params, params)
|
|
52
|
+
model.set_params(**params)
|
|
47
53
|
|
|
48
54
|
features = features if not encode_extra_cols else expand_features_encoded(df, features)
|
|
49
55
|
|
|
50
|
-
model = IsolationForest()
|
|
51
|
-
model.set_params(**params)
|
|
52
56
|
model.fit(df[features].values)
|
|
53
57
|
|
|
54
58
|
def p(new_df: pd.DataFrame) -> pd.DataFrame:
|
|
File without changes
|