autogluon.tabular 1.2.1b20250407__py3-none-any.whl → 1.2.1b20250409__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- autogluon/tabular/register/_ag_model_register.py +0 -2
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/METADATA +13 -13
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/RECORD +11 -22
- autogluon/tabular/models/tab_transformer/__init__.py +0 -1
- autogluon/tabular/models/tab_transformer/hyperparameters/__init__.py +0 -1
- autogluon/tabular/models/tab_transformer/hyperparameters/parameters.py +0 -66
- autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py +0 -17
- autogluon/tabular/models/tab_transformer/modified_transformer.py +0 -494
- autogluon/tabular/models/tab_transformer/pretexts.py +0 -150
- autogluon/tabular/models/tab_transformer/tab_model_base.py +0 -86
- autogluon/tabular/models/tab_transformer/tab_transformer.py +0 -183
- autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py +0 -668
- autogluon/tabular/models/tab_transformer/tab_transformer_model.py +0 -540
- autogluon/tabular/models/tab_transformer/utils.py +0 -124
- /autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe +0 -0
--- autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py
+++ /dev/null
@@ -1,668 +0,0 @@
-# Credits for code in this script to Milan Cvitkovic,
-# Xin Huang, Ashish Khetan and Zohar Karnin
-
-import calendar
-import datetime
-import re
-from collections import Counter
-from datetime import date, datetime
-from functools import partial
-from typing import Iterable, Union
-
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn as nn
-from pandas import DataFrame
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.preprocessing import KBinsDiscretizer, PowerTransformer, QuantileTransformer, RobustScaler
-
-
-class WontEncodeError(Exception):
-    pass
-
-
-class EncBase:
-    cat_cards = []
-    cont_dim = 0
-
-    @property
-    def cat_dim(self):
-        return len(self.cat_cards)
-
-    def clean_data(self, data, dtype=None) -> list:
-        if isinstance(data, pd.Series):
-            data = data.replace({np.nan: None}).to_list()
-        if dtype == "float":
-            unclean_data = data
-            data = []
-            for i in unclean_data:
-                try:
-                    data.append(float(i))
-                except (ValueError, TypeError):
-                    data.append(None)
-        return data
-
-    def fit(self, data: pd.Series, dtype=None):
-        """
-        If dtype == 'float', clean_data will cast the contents of data to floats
-        """
-        if len(pd.unique(data)) == 1:
-            raise WontEncodeError("Column contains only one value")
-        data = self.clean_data(data, dtype)
-        return data
-
-    def enc_cat(self, data: Iterable):
-        raise NotImplementedError
-
-    def enc_cont(self, data: Iterable):
-        raise NotImplementedError
-
-
-class NullEnc(EncBase):
-    """
-    When you want to ignore a feature
-    """
-
-    def fit(self, data: Iterable):
-        pass
-
-    def enc_cat(self, data: Iterable):
-        pass
-
-    def enc_cont(self, data: Iterable):
-        pass
-
-
-class CategoricalOrdinalEnc(EncBase):
-    def __init__(self, sorted_items=None):
-        if sorted_items is not None:
-            assert sorted_items == sorted(sorted_items)
-            self.init_with_sorted_values(sorted_items)
-
-    def fit(self, data: pd.Series):
-        data = super().fit(data)
-        distinct_vals = [i for i in pd.unique(data) if i is not None]
-        sorted_vals = sorted(distinct_vals)
-        if len(sorted_vals) >= 0.5 * len(data) or max(Counter(data).values()) < 10:  # sorry for hardcoding this...
-            raise WontEncodeError("Too many unique values to bother embedding")
-        self.init_with_sorted_values(sorted_vals)
-
-    def init_with_sorted_values(self, sorted_vals):
-        self._item_to_idx = {item: idx for idx, item in enumerate(sorted_vals, start=2)}
-        self._item_to_idx[None] = 1
-        self._item_to_idx[np.nan] = 1
-        self.cat_cards = [len(set(self._item_to_idx.values()))]
-
-    def enc_cat(self, data):
-        """
-        Values that the encoder has never seen before are returned as 1. 0 is reserved for padding.
-        """
-        data = self.clean_data(data)
-        idxs = [self._item_to_idx.get(item, 1) for item in data]
-        return torch.LongTensor(idxs).unsqueeze(1)
-
-    def enc_cont(self, data):
-        pass
-
-
-class ScalarQuantileOrdinalEnc(EncBase):
-    def __init__(self, n_bins_=None, bin_edges_=None):
-        if n_bins_ is not None and bin_edges_ is not None:
-            self.disc = self.get_new_base_enc()
-            self.disc.n_bins_ = np.array([n_bins_])
-            self.disc.bin_edges_ = np.array([np.array(bin_edges_), np.array(bin_edges_[:-1])])[:1]  # Dumb hack, but it's what sklearn needs
-            self.cat_cards = [n_bins_ + 1]
-
-    def fit(self, data):
-        data = super().fit(data, dtype="float")
-        fit_data = [i for i in data if i is not None]
-        fit_data = np.array(fit_data).reshape(-1, 1)
-        self.disc = self.get_new_base_enc()
-        self.disc.fit(fit_data)
-        self.cat_cards = [self.disc.n_bins_.item() + 1]
-
-    def enc_cat(self, data):
-        """
-        Missing values are returned as category 1. 0 is reserved for padding.
-        """
-        data = self.clean_data(data, dtype="float")
-        data = np.array(data).reshape(-1, 1)
-        if None in data:
-            idxs = np.full(len(data), -1, dtype=int)
-            null_idxs = np.where(data == None)[0]
-            val_idxs = np.where(data != None)[0]
-            if len(val_idxs) > 0:
-                vals = self.disc.transform(data[val_idxs]).reshape(-1)
-                idxs[val_idxs] = vals + 2
-            idxs[null_idxs] = 1
-        else:
-            idxs = self.disc.transform(data).reshape(-1) + 2
-        return torch.LongTensor(idxs).unsqueeze(1)
-
-    def enc_cont(self, data):
-        pass
-
-    @staticmethod
-    def get_new_base_enc():
-        return KBinsDiscretizer(n_bins=8, encode="ordinal", strategy="quantile")
-
-    def get_base_enc_params(self):
-        return self.disc.n_bins_, self.disc.bin_edges_
-
-
-class ScalarRescaleEnc(EncBase):
-    cont_dim = 2
-    scaler = None
-
-    def enc_cat(self, data):
-        pass
-
-    def enc_cont(self, scalars):
-        """
-        Returns len(scalars) x 2 tensor, where the second column is a one-hot flag for missing data values
-        """
-        scalars = self.clean_data(scalars, dtype="float")
-        null_flag = np.full(len(scalars), np.nan, dtype=np.float32)
-        vals = np.full(len(scalars), np.nan, dtype=np.float32)
-        null_idxs = np.where(np.array(scalars) == None)[0]
-        val_idxs = np.where(np.array(scalars) != None)[0]
-
-        # One-hot flag for missing values
-        null_flag[null_idxs] = 1
-        null_flag[val_idxs] = 0
-        null_flag = null_flag.reshape(-1, 1)
-
-        # Transform scalar values
-        vals[val_idxs] = np.array(scalars, dtype=np.float32)[val_idxs]
-        vals = vals.reshape(-1, 1)
-        vals = self.scaler.transform(vals) + 1e-7  # Extra 1e-7 to help with correctness testing
-        vals[null_idxs] = 0
-
-        encoded = np.hstack((vals, null_flag))
-        encoded = encoded.clip(-5, 5)  # Guarding against outlier values
-        return torch.FloatTensor(encoded)
-
-    @staticmethod
-    def get_new_base_enc():
-        raise NotImplementedError
-
-    def get_base_enc_params(self):
-        raise NotImplementedError
-
-
-class ScalarRobustScalerEnc(ScalarRescaleEnc):
-    def __init__(self, center_=None, scale_=None):
-        if center_ is not None and scale_ is not None:
-            self.scaler = self.get_new_base_enc()
-            self.scaler.center_ = center_
-            self.scaler.scale_ = scale_
-
-    def fit(self, data: pd.Series):
-        data = super().fit(data, dtype="float")
-        data = np.array(data).reshape(-1, 1)
-        self.scaler = self.get_new_base_enc()
-        self.scaler.fit(data)
-        if any(sum(np.isnan(p) for p in self.get_base_enc_params())):
-            self.scaler.center_ = 0
-            self.scaler.scale_ = 1
-
-    @staticmethod
-    def get_new_base_enc():
-        return RobustScaler()
-
-    def get_base_enc_params(self):
-        return self.scaler.center_, self.scaler.scale_
-
-
-class ScalarPowerTransformerEnc(ScalarRescaleEnc):
-    def __init__(self, lambdas_=None, scale_=None, mean_=None, var_=None, n_samples_seen_=None):
-        if all(a is not None for a in [lambdas_, scale_, mean_, var_, n_samples_seen_]):
-            self.scaler = self.get_new_base_enc()
-            self.scaler.fit([[0.0]])  # This is just to make the PowerTransformer initialize before we overwrite its params
-            self.scaler.lambdas_ = np.array([lambdas_])
-            self.scaler._scaler.scale_ = np.array([scale_])
-            self.scaler._scaler.mean_ = np.array([mean_])
-            self.scaler._scaler.var_ = np.array([var_])
-            self.scaler._scaler.n_samples_seen_ = n_samples_seen_
-
-    def fit(self, data):
-        data = super().fit(data, dtype="float")
-        data = np.array(data).reshape(-1, 1)
-        self.scaler = self.get_new_base_enc()
-        self.scaler.fit(data)
-
-    @staticmethod
-    def get_new_base_enc():
-        return PowerTransformer(method="yeo-johnson", standardize=True, copy=True)
-
-    def get_base_enc_params(self):
-        return self.scaler.lambdas_, self.scaler._scaler.scale_, self.scaler._scaler.mean_, self.scaler._scaler.var_, self.scaler._scaler.n_samples_seen_
-
-
-class ScalarQuantileTransformerEnc(ScalarRescaleEnc):
-    def __init__(self, n_quantiles_=None, quantiles_=None, references_=None):
-        if all(a is not None for a in [n_quantiles_, quantiles_, references_]):
-            self.scaler = self.get_new_base_enc()
-            self.scaler.n_quantiles_ = n_quantiles_
-            self.scaler.quantiles_ = np.array(quantiles_).reshape(-1, 1)
-            self.scaler.references_ = np.array(references_)
-
-    def fit(self, data):
-        data = super().fit(data, dtype="float")
-        data = np.array(data).reshape(-1, 1)
-        self.scaler = self.get_new_base_enc()
-        self.scaler.fit(data)
-
-    @staticmethod
-    def get_new_base_enc():
-        return QuantileTransformer()
-
-    def get_base_enc_params(self):
-        return self.scaler.n_quantiles_, self.scaler.quantiles_, self.scaler.references_
-
-
-class DatetimeScalarEnc(EncBase):
-    # int for type refers to the cardinality of the one-hot
-    cols_types = [
-        ("Year", "float"),
-        ("Month", 12),
-        ("Week", 53),
-        ("Day", 31),
-        ("Dayofweek", 7),
-        ("Dayofyear", "float"),
-        ("Is_month_end", 2),
-        ("Is_month_start", 2),
-        ("Is_quarter_end", 2),
-        ("Is_quarter_start", 2),
-        ("Is_year_end", 2),
-        ("Is_year_start", 2),
-        ("weekday_cos", "float"),
-        ("weekday_sin", "float"),
-        ("day_month_cos", "float"),
-        ("day_month_sin", "float"),
-        ("month_year_cos", "float"),
-        ("month_year_sin", "float"),
-        ("day_year_cos", "float"),
-        ("day_year_sin", "float"),
-    ]
-    cont_dim = sum([n if type(n) == int else 1 for _, n in cols_types])
-
-    def enc_cat(self, data):
-        pass
-
-    def enc_cont(self, datetimes):
-        datetimes = self.clean_data(datetimes)
-        df = pd.DataFrame({"dt": datetimes})
-        add_datepart(df, field_name="dt", prefix="", drop=False)
-        df = add_cyclic_datepart(df, field_name="dt", prefix="", drop=False)
-        enc = torch.empty(len(datetimes), self.cont_dim)
-        feats_done = 0
-        for c, t in self.cols_types:
-            feats_doing = 1 if t == "float" else t
-            if t == "float":
-                feats = torch.FloatTensor(df[c].to_numpy()).view(-1, 1)
-                if c == "Year":
-                    feats = (feats - 2000) / 10
-                elif c == "Dayofyear":
-                    feats /= 365
-            else:
-                feats = torch.LongTensor(df[c].to_numpy().astype("int32")).view(-1, 1)
-                if c in ["Month", "Week", "Day"]:
-                    feats -= 1
-                feats = one_hot(feats, t)
-            enc[:, feats_done : feats_done + feats_doing] = feats
-            feats_done += feats_doing
-        return enc
-
-
-class DatetimeOrdinalEnc(EncBase):
-    # These are all 1 larger than you'd expect to support missing values
-    cols_types = [
-        ("Month", 13),
-        ("Week", 54),
-        ("Day", 32),
-        ("Dayofweek", 8),
-        ("Is_month_end", 3),
-        ("Is_month_start", 3),
-        ("Is_quarter_end", 3),
-        ("Is_quarter_start", 3),
-        ("Is_year_end", 3),
-        ("Is_year_start", 3),
-    ]
-    cat_cards = [n for _, n in cols_types]
-
-    def enc_cat(self, datetimes):
-        # todo: add support for missing values, which should get encoded as 1.
-        datetimes = self.clean_data(datetimes)
-        df = pd.DataFrame({"dt": datetimes})
-        add_datepart(df, field_name="dt", prefix="", drop=False)
-        feats = []
-        for c, t in self.cols_types:
-            f = torch.LongTensor(df[c].to_numpy().astype("int32"))
-            if c in ["Month", "Week", "Day"]:
-                f -= 1
-            feats.append(f)
-        feats = torch.stack(feats, dim=1) + 2  # + 2 for missing and padding
-        return feats
-
-    def enc_cont(self, data):
-        pass
-
-
-class LatLongScalarEnc(EncBase):
-    cont_dim = 5
-
-    def enc_cat(self, data):
-        pass
-
-    def enc_cont(self, latlongs):
-        latlongs = self.clean_data(latlongs)
-        if isinstance(latlongs[0], str):
-            fixed = []
-            for ll in latlongs:
-                lat, long = ll.strip("()").split(",")
-                lat, long = float(lat), float(long)
-                fixed.append((lat, long))
-            latlongs = fixed
-        latlongs = np.array(latlongs)
-        lats, longs = latlongs[:, 0:1], latlongs[:, 1:2]
-        x = np.cos(lats) * np.cos(longs)
-        y = np.cos(lats) * np.sin(longs)
-        z = np.sin(lats)
-        lats /= 90
-        longs /= 180
-        latlongs = np.hstack((lats, longs, x, y, z))
-        return torch.Tensor(latlongs)
-
-
-class LatLongQuantileOrdinalEnc(EncBase):
-    def __init__(self, disc_params=None):
-        if disc_params is not None:
-            self.cat_cards = []
-            self.discs = self.get_new_base_enc()
-            for disc, (n_bins_, bin_edges_) in zip(self.discs, disc_params):
-                disc.disc.n_bins_ = n_bins_
-                disc.bin_edges_ = bin_edges_
-                self.cat_cards.append(n_bins_ + 2)
-
-    def fit(self, data):
-        data = LatLongScalarEnc().enc_cont(data)
-        self.cat_cards = []
-        self.discs = self.get_new_base_enc()
-        for col, disc in enumerate(self.discs):
-            fit_data = data[:, col].numpy().reshape(-1, 1)
-            disc.fit(fit_data)
-            self.cat_cards.append(int(disc.n_bins_ + 2))
-
-    def enc_cat(self, data):
-        # todo: add support for missing values, which should get encoded as 1.
-        data = LatLongScalarEnc().enc_cont(data)
-        feats = []
-        for col, disc in enumerate(self.discs):
-            d = data[:, col].reshape(-1, 1)
-            d = disc.transform(d).reshape(-1)
-            d = d + 2  # for missing and padding
-            feats.append(d)
-        feats = np.stack(feats, axis=1)
-        return torch.LongTensor(feats)
-
-    def enc_cont(self, data):
-        pass
-
-    @staticmethod
-    def get_new_base_enc():
-        return [KBinsDiscretizer(n_bins=8, encode="ordinal", strategy="quantile") for _ in range(LatLongScalarEnc.cont_dim)]
-
-    def get_base_enc_params(self):
-        return [(disc.n_bins_, disc.bin_edges_) for disc in self.discs]
-
-
-class TfidfEnc(EncBase):
-    def __init__(self, vocabulary_=None, idf_=None):
-        if vocabulary_ is not None and idf_ is not None:
-            self.tfidf = self.get_new_base_enc()
-            self.tfidf.vocabulary_ = vocabulary_
-            self.tfidf.idf_ = np.array(idf_)
-            self.cont_dim = len(vocabulary_)
-
-    def enc_cat(self, data):
-        pass
-
-    def enc_cont(self, data):
-        data = self.clean_data(data)
-        text_strings = np.array([d if d is not None else "" for d in data])
-        encoded = self.tfidf.transform(text_strings)
-        encoded = torch.Tensor(encoded.todense())
-        # todo: wait until pytorch lets you use multiproc with sparse tensors
-        # encoded = encoded.tocoo()
-        # i = torch.LongTensor(np.vstack((encoded.row, encoded.col)))
-        # v = torch.FloatTensor(encoded.data)
-        # encoded = torch.sparse.FloatTensor(i, v, torch.Size(encoded.shape))
-        return encoded
-
-    def fit(self, data):
-        data = super().fit(data)
-        data = [d if d is not None else "" for d in data]
-        self.tfidf = self.get_new_base_enc().fit(data)
-        self.cont_dim = len(self.tfidf.vocabulary_)
-
-    @staticmethod
-    def get_new_base_enc():
-        return TfidfVectorizer(input="content", decode_error="replace", strip_accents="ascii", lowercase=True, analyzer="word", min_df=5 / 100000)
-
-    def get_base_enc_params(self):
-        return self.tfidf.vocabulary_, self.tfidf.idf_
-
-
-class TextSummaryScalarEnc(EncBase):
-    """
-    Returns the featuretools summary statistics about the text (num words and num_chars), but normalized
-    """
-
-    cont_dim = 2
-
-    def __init__(self, center_=None, scale_=None):
-        if center_ is not None and scale_ is not None:
-            self.scaler = RobustScaler()
-            self.scaler.center_ = center_
-            self.scaler.scale_ = scale_
-
-    def enc_cat(self, data):
-        pass
-
-    def enc_cont(self, data):
-        data = self.clean_data(data)
-        text_strings = [s if s is not None else "" for s in data]
-        encoded = self.get_encoded(text_strings)
-        encoded = self.scaler.transform(encoded)
-        encoded = torch.Tensor(encoded)
-        return encoded
-
-    def get_encoded(self, text_strings):
-        text_strings = [ts if ts is not None else "" for ts in text_strings]
-        num_chars = [len(ts) for ts in text_strings]
-        num_words = [len(ts.split()) for ts in text_strings]
-        return np.array((num_chars, num_words)).T
-
-    def fit(self, data):
-        data = super().fit(data)
-        encoded = self.get_encoded(data)
-        self.scaler = RobustScaler().fit(encoded)
-
-    def get_base_enc_params(self):
-        return self.scaler.center_, self.scaler.scale_
-
-
-class EmbeddingInitializer(nn.Module):
-    def __init__(
-        self,
-        num_embeddings,
-        max_emb_dim,
-        p_dropout,
-        minimize_emb_dim=True,
-        drop_whole_embeddings=False,
-        one_hot=False,
-        out_dim=None,
-        shared_embedding=False,
-        n_shared_embs=8,
-        shared_embedding_added=False,
-    ):
-        """
-        :param minimize_emb_dim:
-            Whether to set embedding_dim = max_emb_dim or to make embedding_dim smaller is num_embeddings is small
-        :param drop_whole_embeddings:
-            If True, dropout pretends the embedding was a missing value. If false, dropout sets embed features to 0
-        :param one_hot:
-            If True, one-hot encode variables whose cardinality is < max_emb_dim. Also, set requires_grad = False
-        :param out_dim:
-            If None, return the embedding straight from self.embed. If another dimension, put the embedding through a
-            Linear layer to make it size (batch x out_dim).
-        :param shared_embedding:
-            If True, 1/(n_shared_embs)th of every embedding will be reserved for a learned parameter that's common to all embeddings.
-            This is useful for transformers to identify which column an embedding came from.
-            Mutually exclusive with one_hot.
-
-        Note: the 0 embedding is reserved for padding and masking. The various encoders use 1 for missing values.
-
-        """
-        super().__init__()
-        assert not (one_hot and out_dim is not None)
-        self.p_dropout = p_dropout
-        self.drop_whole_embeddings = drop_whole_embeddings
-        self.shared_embedding = shared_embedding
-        self.shared_embedding_added = shared_embedding_added
-        if minimize_emb_dim or one_hot:
-            self.emb_dim = min(max_emb_dim, num_embeddings)  # Don't use a crazy huge embedding if not needed
-        else:
-            self.emb_dim = max_emb_dim
-        self.reshape_out = nn.Identity()
-        if out_dim is not None:
-            assert self.emb_dim <= out_dim, "Makes no sense: just set max_emb_dim = out_dim and out_dim = None"
-            if num_embeddings > self.emb_dim:
-                self.reshape_out = nn.Linear(self.emb_dim, out_dim, bias=True)
-            else:
-                self.emb_dim = out_dim
-        # Note: if you change the name of self.embed, or initialize an embedding elsewhere in a model,
-        # the function get_optim will not work properly
-        self.embed = nn.Embedding(num_embeddings=num_embeddings + 1, embedding_dim=self.emb_dim, padding_idx=0)
-        self.embed.weight.data.clamp_(-2, 2)  # Use truncated normal init
-        if one_hot:
-            self.embed.weight.requires_grad = False
-            if num_embeddings <= max_emb_dim:
-                self.embed.weight.data[1:, :] = torch.eye(self.emb_dim)
-        if shared_embedding:
-            assert not one_hot
-            ce_dim = self.emb_dim if shared_embedding_added else (out_dim if out_dim else self.emb_dim) // n_shared_embs  # used to be //8
-            self.shared_emb = nn.Parameter(torch.empty(1, ce_dim).uniform_(-1, 1))
-        self.do = nn.Dropout(p=p_dropout)
-
-    def forward(self, input):
-        if self.drop_whole_embeddings and self.training:
-            mask = torch.zeros_like(input).bernoulli_(1 - self.p_dropout)
-            input = input * mask
-        out = self.embed(input)
-        if not self.drop_whole_embeddings:
-            out = self.do(out)
-        out = self.reshape_out(out)
-        if self.shared_embedding:
-            shared_emb = self.shared_emb.expand(out.shape[0], -1)
-            if not self.shared_embedding_added:
-                out[:, : shared_emb.shape[1]] = shared_emb
-            else:
-                out += shared_emb
-        return out
-
-
-def one_hot(x, card):
-    assert isinstance(x, torch.LongTensor)
-    assert x.dim() == 2
-    x_one_hot = x.new_zeros(x.size()[0], card).scatter_(1, x, 1)
-    return x_one_hot
-
-
-"""
-These functions stolen wholesale, with much gratitude, from
-https://github.com/fastai/fastai/blob/master/fastai/tabular/transform.py
-"""
-
-
-def make_date(df: DataFrame, date_field: str):
-    "Make sure `df[field_name]` is of the right date type."
-    field_dtype = df[date_field].dtype
-    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
-        field_dtype = np.datetime64
-    if not np.issubdtype(field_dtype, np.datetime64):
-        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True, format="mixed")
-
-
-def add_datepart(df: DataFrame, field_name: str, prefix: str = None, drop: bool = True, time: bool = False):
-    "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
-    make_date(df, field_name)
-    field = df[field_name]
-    prefix = re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix
-    attr = [
-        "Year",
-        "Month",
-        "Week",
-        "Day",
-        "Dayofweek",
-        "Dayofyear",
-        "Is_month_end",
-        "Is_month_start",
-        "Is_quarter_end",
-        "Is_quarter_start",
-        "Is_year_end",
-        "Is_year_start",
-    ]
-    if time:
-        attr = attr + ["Hour", "Minute", "Second"]
-    for n in attr:
-        df[prefix + n] = getattr(field.dt, n.lower())
-    if drop:
-        df.drop(field_name, axis=1, inplace=True)
-    return df
-
-
-def cyclic_dt_feat_names(time: bool = True, add_linear: bool = False) -> list[str]:
-    "Return feature names of date/time cycles as produced by `cyclic_dt_features`."
-    fs = ["cos", "sin"]
-    attr = [f"{r}_{f}" for r in "weekday day_month month_year day_year".split() for f in fs]
-    if time:
-        attr += [f"{r}_{f}" for r in "hour clock min sec".split() for f in fs]
-    if add_linear:
-        attr.append("year_lin")
-    return attr
-
-
-def cyclic_dt_features(d: Union[date, datetime], time: bool = True, add_linear: bool = False) -> list[float]:
-    "Calculate the cos and sin of date/time cycles."
-    tt, fs = d.timetuple(), [np.cos, np.sin]
-    day_year, days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
-    days_year = 366 if calendar.isleap(d.year) else 365
-    rs = d.weekday() / 7, (d.day - 1) / days_month, (d.month - 1) / 12, (day_year - 1) / days_year
-    feats = [f(r * 2 * np.pi) for r in rs for f in fs]
-    if time and isinstance(d, datetime) and type(d) != date:
-        rs = tt.tm_hour / 24, tt.tm_hour % 12 / 12, tt.tm_min / 60, tt.tm_sec / 60
-        feats += [f(r * 2 * np.pi) for r in rs for f in fs]
-    if add_linear:
-        if type(d) == date:
-            feats.append(d.year + rs[-1])
-        else:
-            secs_in_year = (datetime(d.year + 1, 1, 1) - datetime(d.year, 1, 1)).total_seconds()
-            feats.append(d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year))
-    return feats
-
-
-def add_cyclic_datepart(df: DataFrame, field_name: str, prefix: str = None, drop: bool = True, time: bool = False, add_linear: bool = False):
-    "Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`."
-    make_date(df, field_name)
-    field = df[field_name]
-    prefix = re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix
-    series = field.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
-    columns = [prefix + c for c in cyclic_dt_feat_names(time, add_linear)]
-    df_feats = pd.DataFrame([item for item in series], columns=columns)
-    df = pd.concat([df, df_feats], axis=1)
-    if drop:
-        df.drop(field_name, axis=1, inplace=True)
-    return df
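For reference, the encoders removed above share one index convention, stated in the `CategoricalOrdinalEnc.enc_cat` and `EmbeddingInitializer` docstrings: embedding index 0 is reserved for padding and masking, index 1 marks missing or unseen values, and real categories start at 2. The sketch below illustrates that convention using only classes shown in the deleted module; it is an untested example that assumes the 1.2.1b20250407 wheel is installed, since the import path no longer exists in 1.2.1b20250409.

import pandas as pd

from autogluon.tabular.models.tab_transformer.tab_transformer_encoder import (
    CategoricalOrdinalEnc,
    EmbeddingInitializer,
)

# fit() rejects columns that are too distinct (>= 50% unique values) or whose
# most common value appears fewer than 10 times, so repeat each category.
train = pd.Series(["a"] * 10 + ["b"] * 10)
enc = CategoricalOrdinalEnc()
enc.fit(train)  # maps "a" -> 2, "b" -> 3

idxs = enc.enc_cat(["a", "b", "c", None])
print(idxs.squeeze(1).tolist())  # [2, 3, 1, 1]: unseen "c" and None both map to 1

# EmbeddingInitializer consumes these indices with padding_idx=0, matching the
# 0 = padding / 1 = missing convention above.
emb = EmbeddingInitializer(num_embeddings=enc.cat_cards[0], max_emb_dim=8, p_dropout=0.1)
print(emb(idxs.squeeze(1)).shape)  # torch.Size([4, 3]): emb_dim = min(max_emb_dim, num_embeddings)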