vaex-ml 0.18.3__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vaex/ml/_version.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = '0.18.3'
- __version_tuple__ = (0, 18, 3)
+ __version__ = '0.19.0'
+ __version_tuple__ = (0, 19, 0)
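
Only the version constants change in this file. A quick check of which build is installed (a minimal sketch, assuming the wheel is installed in the current environment):

import vaex.ml._version

print(vaex.ml._version.__version__)        # expected: '0.19.0'
print(vaex.ml._version.__version_tuple__)  # expected: (0, 19, 0)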
vaex/ml/catboost.py CHANGED
@@ -10,50 +10,49 @@ from . import generate
  import numpy as np
  import catboost

-
  @vaex.serialize.register
  @generate.register
  class CatBoostModel(state.HasState):
  '''The CatBoost algorithm.

- This class provides an interface to the CatBoost aloritham.
- CatBoost is a fast, scalable, high performance Gradient Boosting on
- Decision Trees library, used for ranking, classification, regression and
- other machine learning tasks. For more information please visit
- https://github.com/catboost/catboost
-
- Example:
-
- >>> import vaex
- >>> import vaex.ml.catboost
- >>> df = vaex.datasets.iris()
- >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']
- >>> df_train, df_test = df.ml.train_test_split()
- >>> params = {
- 'leaf_estimation_method': 'Gradient',
- 'learning_rate': 0.1,
- 'max_depth': 3,
- 'bootstrap_type': 'Bernoulli',
- 'objective': 'MultiClass',
- 'eval_metric': 'MultiClass',
- 'subsample': 0.8,
- 'random_state': 42,
- 'verbose': 0}
- >>> booster = vaex.ml.catboost.CatBoostModel(features=features, target='class_', num_boost_round=100, params=params)
- >>> booster.fit(df_train)
- >>> df_train = booster.transform(df_train)
- >>> df_train.head(3)
- # sepal_length sepal_width petal_length petal_width class_ catboost_prediction
- 0 5.4 3 4.5 1.5 1 [0.00615039 0.98024259 0.01360702]
- 1 4.8 3.4 1.6 0.2 0 [0.99034267 0.00526382 0.0043935 ]
- 2 6.9 3.1 4.9 1.5 1 [0.00688241 0.95190908 0.04120851]
- >>> df_test = booster.transform(df_test)
- >>> df_test.head(3)
- # sepal_length sepal_width petal_length petal_width class_ catboost_prediction
- 0 5.9 3 4.2 1.5 1 [0.00464228 0.98883351 0.00652421]
- 1 6.1 3 4.6 1.4 1 [0.00350424 0.9882139 0.00828186]
- 2 6.6 2.9 4.6 1.3 1 [0.00325705 0.98891631 0.00782664]
- '''
+ This class provides an interface to the CatBoost algorithm.
+ CatBoost is a fast, scalable, high performance Gradient Boosting on
+ Decision Trees library, used for ranking, classification, regression and
+ other machine learning tasks. For more information please visit
+ https://github.com/catboost/catboost
+
+ Example:
+
+ >>> import vaex
+ >>> import vaex.ml.catboost
+ >>> df = vaex.datasets.iris()
+ >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']
+ >>> df_train, df_test = df.ml.train_test_split()
+ >>> params = {
+ 'leaf_estimation_method': 'Gradient',
+ 'learning_rate': 0.1,
+ 'max_depth': 3,
+ 'bootstrap_type': 'Bernoulli',
+ 'objective': 'MultiClass',
+ 'eval_metric': 'MultiClass',
+ 'subsample': 0.8,
+ 'random_state': 42,
+ 'verbose': 0}
+ >>> booster = vaex.ml.catboost.CatBoostModel(features=features, target='class_', num_boost_round=100, params=params)
+ >>> booster.fit(df_train)
+ >>> df_train = booster.transform(df_train)
+ >>> df_train.head(3)
+ # sepal_length sepal_width petal_length petal_width class_ catboost_prediction
+ 0 5.4 3 4.5 1.5 1 [0.00615039 0.98024259 0.01360702]
+ 1 4.8 3.4 1.6 0.2 0 [0.99034267 0.00526382 0.0043935 ]
+ 2 6.9 3.1 4.9 1.5 1 [0.00688241 0.95190908 0.04120851]
+ >>> df_test = booster.transform(df_test)
+ >>> df_test.head(3)
+ # sepal_length sepal_width petal_length petal_width class_ catboost_prediction
+ 0 5.9 3 4.2 1.5 1 [0.00464228 0.98883351 0.00652421]
+ 1 6.1 3 4.6 1.4 1 [0.00350424 0.9882139 0.00828186]
+ 2 6.6 2.9 4.6 1.3 1 [0.00325705 0.98891631 0.00782664]
+ '''
  snake_name = "catboost_model"
  features = traitlets.List(traitlets.Unicode(), help='List of features to use when fitting the CatBoostModel.')
  target = traitlets.Unicode(allow_none=False, help='The name of the target column.')
vaex/ml/lightgbm.py CHANGED
@@ -86,7 +86,7 @@ class LightGBMModel(state.HasState):
  copy.add_virtual_column(self.prediction_name, expression, unique=False)
  return copy

- def fit(self, df, valid_sets=None, valid_names=None, early_stopping_rounds=None, evals_result=None, verbose_eval=None, **kwargs):
+ def fit(self, df, valid_sets=None, valid_names=None, early_stopping_rounds=None, evals_result=None, verbose_eval=False, **kwargs):
  """Fit the LightGBMModel to the DataFrame.

  The model will train until the validation score stops improving.
@@ -112,14 +112,19 @@ class LightGBMModel(state.HasState):
  else:
  valid_sets = ()

+ callbacks = [
+ lightgbm.callback.record_evaluation(eval_result=evals_result) if evals_result is not None else None,
+ lightgbm.callback.early_stopping(stopping_rounds=early_stopping_rounds) if early_stopping_rounds else None,
+ lightgbm.callback.log_evaluation() if verbose_eval else None
+ ]
+ callbacks = [callback for callback in callbacks if callback is not None]
+
  self.booster = lightgbm.train(params=self.params,
  train_set=dtrain,
  num_boost_round=self.num_boost_round,
  valid_sets=valid_sets,
  valid_names=valid_names,
- early_stopping_rounds=early_stopping_rounds,
- evals_result=evals_result,
- verbose_eval=verbose_eval,
+ callbacks=callbacks,
  **kwargs)

  def predict(self, df, **kwargs):
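
This hunk tracks the lightgbm 4.x API: early_stopping_rounds, evals_result and verbose_eval were removed from lightgbm.train() and are now expressed as callbacks. A minimal, self-contained sketch of the same callback wiring (the toy data and parameter values below are illustrative, not taken from vaex):

import numpy as np
import lightgbm

# toy regression data, only to make the example runnable
rng = np.random.default_rng(0)
X = rng.random((200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.random(200)
dtrain = lightgbm.Dataset(X[:150], label=y[:150])
dvalid = lightgbm.Dataset(X[150:], label=y[150:], reference=dtrain)

evals_result = {}
callbacks = [
    lightgbm.callback.record_evaluation(eval_result=evals_result),  # replaces evals_result=
    lightgbm.callback.early_stopping(stopping_rounds=5),            # replaces early_stopping_rounds=
    lightgbm.callback.log_evaluation(),                             # replaces verbose_eval=
]
booster = lightgbm.train(params={'objective': 'regression', 'verbose': -1},
                         train_set=dtrain,
                         num_boost_round=50,
                         valid_sets=[dvalid],
                         valid_names=['valid'],
                         callbacks=callbacks)
print(list(evals_result['valid']))  # the recorded validation metrics, e.g. ['l2']

Note that the new verbose_eval=False default simply means no log_evaluation callback is added unless the caller asks for logging.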
vaex/ml/spec.json CHANGED
@@ -940,7 +940,7 @@
  },
  {
  "classname": "CatBoostModel",
- "doc": "The CatBoost algorithm.\n\n This class provides an interface to the CatBoost aloritham.\n CatBoost is a fast, scalable, high performance Gradient Boosting on\n Decision Trees library, used for ranking, classification, regression and\n other machine learning tasks. For more information please visit\n https://github.com/catboost/catboost\n\n Example:\n\n >>> import vaex\n >>> import vaex.ml.catboost\n >>> df = vaex.datasets.iris()\n >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']\n >>> df_train, df_test = df.ml.train_test_split()\n >>> params = {\n 'leaf_estimation_method': 'Gradient',\n 'learning_rate': 0.1,\n 'max_depth': 3,\n 'bootstrap_type': 'Bernoulli',\n 'objective': 'MultiClass',\n 'eval_metric': 'MultiClass',\n 'subsample': 0.8,\n 'random_state': 42,\n 'verbose': 0}\n >>> booster = vaex.ml.catboost.CatBoostModel(features=features, target='class_', num_boost_round=100, params=params)\n >>> booster.fit(df_train)\n >>> df_train = booster.transform(df_train)\n >>> df_train.head(3)\n # sepal_length sepal_width petal_length petal_width class_ catboost_prediction\n 0 5.4 3 4.5 1.5 1 [0.00615039 0.98024259 0.01360702]\n 1 4.8 3.4 1.6 0.2 0 [0.99034267 0.00526382 0.0043935 ]\n 2 6.9 3.1 4.9 1.5 1 [0.00688241 0.95190908 0.04120851]\n >>> df_test = booster.transform(df_test)\n >>> df_test.head(3)\n # sepal_length sepal_width petal_length petal_width class_ catboost_prediction\n 0 5.9 3 4.2 1.5 1 [0.00464228 0.98883351 0.00652421]\n 1 6.1 3 4.6 1.4 1 [0.00350424 0.9882139 0.00828186]\n 2 6.6 2.9 4.6 1.3 1 [0.00325705 0.98891631 0.00782664]\n ",
+ "doc": "The CatBoost algorithm.\n\nThis class provides an interface to the CatBoost algorithm.\nCatBoost is a fast, scalable, high performance Gradient Boosting on\nDecision Trees library, used for ranking, classification, regression and\nother machine learning tasks. For more information please visit\nhttps://github.com/catboost/catboost\n\nExample:\n\n>>> import vaex\n>>> import vaex.ml.catboost\n>>> df = vaex.datasets.iris()\n>>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']\n>>> df_train, df_test = df.ml.train_test_split()\n>>> params = {\n 'leaf_estimation_method': 'Gradient',\n 'learning_rate': 0.1,\n 'max_depth': 3,\n 'bootstrap_type': 'Bernoulli',\n 'objective': 'MultiClass',\n 'eval_metric': 'MultiClass',\n 'subsample': 0.8,\n 'random_state': 42,\n 'verbose': 0}\n>>> booster = vaex.ml.catboost.CatBoostModel(features=features, target='class_', num_boost_round=100, params=params)\n>>> booster.fit(df_train)\n>>> df_train = booster.transform(df_train)\n>>> df_train.head(3)\n# sepal_length sepal_width petal_length petal_width class_ catboost_prediction\n0 5.4 3 4.5 1.5 1 [0.00615039 0.98024259 0.01360702]\n1 4.8 3.4 1.6 0.2 0 [0.99034267 0.00526382 0.0043935 ]\n2 6.9 3.1 4.9 1.5 1 [0.00688241 0.95190908 0.04120851]\n>>> df_test = booster.transform(df_test)\n>>> df_test.head(3)\n# sepal_length sepal_width petal_length petal_width class_ catboost_prediction\n0 5.9 3 4.2 1.5 1 [0.00464228 0.98883351 0.00652421]\n1 6.1 3 4.6 1.4 1 [0.00350424 0.9882139 0.00828186]\n2 6.6 2.9 4.6 1.3 1 [0.00325705 0.98891631 0.00782664]\n",
  "module": "vaex.ml.catboost",
  "snake_name": "catboost_model",
  "traits": [
vaex/ml/tensorflow.py CHANGED
@@ -1,6 +1,6 @@
  import base64
  import tempfile
- import shutil
+ import os.path

  import numpy as np

@@ -95,7 +95,10 @@ class DataFrameAccessorTensorflow():
  chunk_shape = len(chunk[0].shape) + 1
  transpose_order = np.arange(1, chunk_shape).tolist() + [0]
  X = np.array(chunk[:-n_target_cols]).transpose(transpose_order)
- y = np.array(chunk[-n_target_cols:], copy=False).T
+ if np.lib.NumpyVersion(np.__version__) >= '2.0.0':
+ y = np.array(chunk[-n_target_cols:]).T
+ else:
+ y = np.array(chunk[-n_target_cols:], copy=False).T
  yield (X, y)

  else:
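
The version switch above is a NumPy 2.0 compatibility fix: np.array(..., copy=False) now raises if a copy cannot be avoided, whereas on NumPy 1.x it meant "copy only if needed". A small sketch of the same guard in isolation (the chunk data is illustrative):

import numpy as np

chunks = [np.arange(3.0), np.arange(3.0) + 10]  # stand-in for the evaluated target columns
if np.lib.NumpyVersion(np.__version__) >= '2.0.0':
    y = np.array(chunks).T              # NumPy 2.x: let np.array decide whether to copy
else:
    y = np.array(chunks, copy=False).T  # NumPy 1.x: avoid a copy when possible
print(y.shape)  # (3, 2)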
@@ -177,13 +180,12 @@ class KerasModel(vaex.ml.state.HasState):

  def state_get(self):
  state = super(KerasModel, self).state_get()
-
  with tempfile.TemporaryDirectory() as directory:
- self.model.save(directory)
- zip_path = tempfile.mktemp(".zip")
- shutil.make_archive(zip_path[:-4], 'zip', directory)
- with open(zip_path, 'rb') as f:
+ filepath = os.path.join(directory, 'model.keras')
+ self.model.save(filepath)
+ with open(filepath, 'rb') as f:
  data = f.read()
+ # store model as raw zip bytes base64 encoded
  state['model'] = base64.encodebytes(data).decode('ascii')
  return state

@@ -194,8 +196,7 @@ class KerasModel(vaex.ml.state.HasState):

  data = base64.decodebytes(model_data.encode('ascii'))
  with tempfile.TemporaryDirectory() as directory:
- zip_path = tempfile.mktemp('.zip')
- with open(zip_path, 'wb') as f:
+ filepath = os.path.join(directory, 'model.keras')
+ with open(filepath, 'wb') as f:
  f.write(data)
- shutil.unpack_archive(zip_path, directory)
- self.model = K.models.load_model(directory, custom_objects=self.custom_objects)
+ self.model = K.models.load_model(filepath, custom_objects=self.custom_objects)
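
The serialization hunks above drop the zipped SavedModel directory in favour of Keras' single-file .keras format, which can be shuttled around as plain bytes. A minimal sketch of that round trip, assuming a TensorFlow release with native .keras support (the wheel pins tensorflow~=2.18); the toy model is illustrative, and vaex keeps the bytes base64-encoded in its state dict:

import base64
import os.path
import tempfile
import tensorflow.keras as K

model = K.Sequential([K.layers.Dense(1)])
model.build(input_shape=(None, 4))

# serialize: save to a single .keras file and keep only its raw bytes
with tempfile.TemporaryDirectory() as directory:
    filepath = os.path.join(directory, 'model.keras')
    model.save(filepath)
    with open(filepath, 'rb') as f:
        encoded = base64.encodebytes(f.read()).decode('ascii')

# deserialize: write the bytes back to disk and load the model again
with tempfile.TemporaryDirectory() as directory:
    filepath = os.path.join(directory, 'model.keras')
    with open(filepath, 'wb') as f:
        f.write(base64.decodebytes(encoded.encode('ascii')))
    restored = K.models.load_model(filepath)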
vaex/ml/transformations.py CHANGED
@@ -423,7 +423,7 @@ class OneHotEncoder(Transformer):
  :rtype: DataFrame
  '''
  copy = df.copy()
- downcast_uint8 = np.can_cast(self.one, np.uint8) and np.can_cast(self.zero, np.uint8)
+ downcast_uint8 = np.can_cast(np.min_scalar_type(self.one), np.uint8) and np.can_cast(np.min_scalar_type(self.zero), np.uint8)
  dtype = 'uint8' if downcast_uint8 else None
  # for each feature, add a virtual column for each unique entry
  for i, feature in enumerate(self.features):
@@ -432,7 +432,7 @@ class OneHotEncoder(Transformer):
  column_name = self.prefix + feature + '_' + str_value
  if value is None:
  copy[column_name] = copy.func.where(copy[feature].ismissing(), self.one, self.zero, dtype=dtype)
- elif isinstance(value, np.float) and np.isnan(value):
+ elif isinstance(value, float) and np.isnan(value):
  copy[column_name] = copy.func.where(copy[feature].isnan(), self.one, self.zero, dtype=dtype)
  else:
  copy[column_name] = copy.func.where(copy[feature] == value, self.one, self.zero, dtype=dtype)
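
Both OneHotEncoder changes are NumPy compatibility fixes: np.float no longer exists (the builtin float is the correct spelling), and NumPy 2.0's np.can_cast no longer accepts Python scalars, so the fill values are first reduced to a dtype with np.min_scalar_type. A small sketch of the casting check, with illustrative stand-ins for self.one and self.zero:

import numpy as np

one, zero = 1, 0  # illustrative fill values
# np.can_cast(1, np.uint8) raises TypeError on NumPy 2.x; going through
# np.min_scalar_type works on both 1.x and 2.x
downcast_uint8 = (np.can_cast(np.min_scalar_type(one), np.uint8)
                  and np.can_cast(np.min_scalar_type(zero), np.uint8))
print(np.min_scalar_type(one), downcast_uint8)  # uint8 True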
@@ -0,0 +1,34 @@
+ Metadata-Version: 2.4
+ Name: vaex-ml
+ Version: 0.19.0
+ Summary: Machine learning support for vaex
+ Home-page: https://www.github.com/vaexio/vaex
+ Author: Jovan Veljanoski
+ Author-email: jovan.veljanoski@gmail.com
+ License: MIT
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.txt
+ Requires-Dist: vaex-core~=4.8
+ Requires-Dist: numba
+ Requires-Dist: traitlets
+ Requires-Dist: jinja2
+ Requires-Dist: annoy
+ Requires-Dist: scikit-learn
+ Requires-Dist: xgboost
+ Requires-Dist: lightgbm~=4.0
+ Requires-Dist: catboost>=1.2.8
+ Provides-Extra: all
+ Requires-Dist: tensorflow~=2.18; extra == "all"
+ Requires-Dist: tensorflow<2.20.0; sys_platform == "darwin" and extra == "all"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: summary
+
+ Machine learning support for vaex
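
The rebuilt metadata pins lightgbm~=4.0 and catboost>=1.2.8 and keeps TensorFlow behind the "all" extra. A quick way to inspect what an installed copy of the wheel actually declares, using only the standard library:

from importlib.metadata import requires, version

print(version('vaex-ml'))        # expected: 0.19.0
for req in requires('vaex-ml'):  # the Requires-Dist entries shown above
    print(req)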
@@ -1,27 +1,27 @@
  vaex/ml/__init__.py,sha256=cg8TmMc1KEJnG9YknEN8VpJ4ikEMcpqnrvxnwMR9NT4,4263
- vaex/ml/_version.py,sha256=9RC3H0aZm8vW4qzbQ2Z2CVmQm3ClG5wATjqUqK1MMJ0,54
- vaex/ml/catboost.py,sha256=x81_1YXov_JfvgG2d-EArlfxPQLx8sAfl_b8I0TT1LI,10312
+ vaex/ml/_version.py,sha256=xB41s6qdt06iHS6CO_taADXuaLN_zxOmjAJox-9eQAk,54
+ vaex/ml/catboost.py,sha256=gFdDfzCKKUqXTf383PNhINOtI3HdWU_0S33tbSWRZa8,10167
  vaex/ml/cluster.py,sha256=SXnEB1tZZFstC6lqqPI-Y9UjUTaAUYEKJCMsi59Jp-M,10295
  vaex/ml/generate.py,sha256=ZH3ekTUmmq74adiJLT29aA8rsICQyGw4pCddkGnF2nQ,2762
- vaex/ml/lightgbm.py,sha256=4Vdj3A5gsq7BOrRvrQqHwac0HeNkg6dR9qlAHxWyjlQ,7641
+ vaex/ml/lightgbm.py,sha256=vcANvNpK0TYNPOFXbMsCg04Uhl4_AWO0rZQZvzCAKQc,7908
  vaex/ml/linear_model.py,sha256=tx55oPrSfbvH2VwOJ_j5pUPyzsDTOgBlEjfVBZOaf2E,4055
  vaex/ml/metrics.py,sha256=gFpxpE9eNxuF-09Q9X__Tz5VZ_v51Rr0Zod9VtCDxTg,20416
  vaex/ml/pipeline.py,sha256=61w3SINePdyZHbyk1q0T1lNQ3W4wMV56sjCratBEAwU,1001
  vaex/ml/sklearn.py,sha256=AKhkzSs1XLi6VPPCQ77xkpOX8EobgQzBgiPUrbuhddo,11671
- vaex/ml/spec.json,sha256=gPajtlUh4pVk1Zcnq-J9wRywZq0H1P3pJYdQd5O177M,71014
+ vaex/ml/spec.json,sha256=1-kdehd6auQHVeBoKG2TCne14MPoIitvKrfgFpBh0NE,70870
  vaex/ml/spec.py,sha256=24jkgxIGzzNhsxmir1bIltZQ8ASw-lBReeXF1TyYh_o,1350
  vaex/ml/state.py,sha256=0g59xmYuJeRwBLWa0YiC9QqAdx2aaL6EG9oZoQ1E7HU,1570
- vaex/ml/tensorflow.py,sha256=tOQARvEWOy6cLoWy27HvtTMUzhhqmDjLrWSP-xr4B7U,9004
- vaex/ml/transformations.py,sha256=2dkWOcfSIeG_pOhVNomKhwgX6brevxYEHvgA_tgv_1c,51292
+ vaex/ml/tensorflow.py,sha256=4xzBgxwUUSgK59OEoezu7DhvLAi9kkt_BjwwmGU9C9A,9146
+ vaex/ml/transformations.py,sha256=xJ_ZM9SArDCXnv1NX2Y_RRPfzUHsE2NPvQ_aDCjCxGA,51329
  vaex/ml/ui.py,sha256=Zg9PWISoUTwRBIaM-4CuezVzIdpSegxNik0DYQJZVEQ,1902
  vaex/ml/xgboost.py,sha256=XxgzKBRiUTDbkaCxe4Ck5seLG1rlk7u7lLIa_uvjojU,7031
  vaex/ml/incubator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vaex/ml/incubator/annoy.py,sha256=ZhxT7_tqeZztOmmv29aphRsITbka2ZTgq5bY0YufnhI,3336
  vaex/ml/incubator/pygbm.py,sha256=QQsbYXZ6HaYl5K1i80S3lHtdVMbo2pWKdwMEYg1EqPg,5033
  vaex/ml/incubator/river.py,sha256=1L6_vfRZY1FubyxXbeF_YKtEbq3heafpG2E3SbL8AAY,6106
- vaex_ml-0.18.3.dist-info/LICENSE.txt,sha256=Ub-0nSwxfezlH092uDbwZqynm9i5wq353cMYNPy5OqY,1087
- vaex_ml-0.18.3.dist-info/METADATA,sha256=RtbGD5eY6dC4xFjQdS5LTXoR4JlktMHyJWTcNIdL8yg,485
- vaex_ml-0.18.3.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
- vaex_ml-0.18.3.dist-info/entry_points.txt,sha256=kWLzYdLs3kYqmlZLcAg2M-GVMEftg1VzvTWx92u7dtk,177
- vaex_ml-0.18.3.dist-info/top_level.txt,sha256=BTcHbKEpfXqVlITrjFEtm95UDejFyYXslGl9YUqTIpc,5
- vaex_ml-0.18.3.dist-info/RECORD,,
+ vaex_ml-0.19.0.dist-info/licenses/LICENSE.txt,sha256=Ub-0nSwxfezlH092uDbwZqynm9i5wq353cMYNPy5OqY,1087
+ vaex_ml-0.19.0.dist-info/METADATA,sha256=ry6S6Y3nMmKeX37dJQTiwgSTEuz9t_2WoUU3ckc-cfo,912
+ vaex_ml-0.19.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ vaex_ml-0.19.0.dist-info/entry_points.txt,sha256=cJ8A2uhjPndjLHCMesv0AcKsKK8oCbZclKR3c8yn4Lo,176
+ vaex_ml-0.19.0.dist-info/top_level.txt,sha256=BTcHbKEpfXqVlITrjFEtm95UDejFyYXslGl9YUqTIpc,5
+ vaex_ml-0.19.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.40.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

@@ -2,4 +2,3 @@
  ml = vaex.ml:DataFrameAccessorML
  ml.metrics = vaex.ml.metrics:DataFrameAccessorMetrics
  ml.tensorflow = vaex.ml.tensorflow:DataFrameAccessorTensorflow
-
@@ -1,20 +0,0 @@
- Metadata-Version: 2.1
- Name: vaex-ml
- Version: 0.18.3
- Summary: Machine learning support for vaex
- Home-page: https://www.github.com/vaexio/vaex
- Author: Jovan Veljanoski
- Author-email: jovan.veljanoski@gmail.com
- License: MIT
- Platform: UNKNOWN
- Requires-Dist: vaex-core (<5,>=4.8.0)
- Requires-Dist: numba
- Requires-Dist: traitlets
- Requires-Dist: jinja2
- Provides-Extra: all
- Requires-Dist: tensorflow (>=2.1.0) ; extra == 'all'
- Requires-Dist: tensorflow-io (>=0.12.0) ; extra == 'all'
-
- UNKNOWN
-
-