autogluon.tabular 1.2.1b20250407__py3-none-any.whl → 1.2.1b20250409__py3-none-any.whl

This diff compares the contents of two package versions publicly released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (22)
  1. autogluon/tabular/register/_ag_model_register.py +0 -2
  2. autogluon/tabular/version.py +1 -1
  3. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/METADATA +13 -13
  4. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/RECORD +11 -22
  5. autogluon/tabular/models/tab_transformer/__init__.py +0 -1
  6. autogluon/tabular/models/tab_transformer/hyperparameters/__init__.py +0 -1
  7. autogluon/tabular/models/tab_transformer/hyperparameters/parameters.py +0 -66
  8. autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py +0 -17
  9. autogluon/tabular/models/tab_transformer/modified_transformer.py +0 -494
  10. autogluon/tabular/models/tab_transformer/pretexts.py +0 -150
  11. autogluon/tabular/models/tab_transformer/tab_model_base.py +0 -86
  12. autogluon/tabular/models/tab_transformer/tab_transformer.py +0 -183
  13. autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py +0 -668
  14. autogluon/tabular/models/tab_transformer/tab_transformer_model.py +0 -540
  15. autogluon/tabular/models/tab_transformer/utils.py +0 -124
  16. /autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth +0 -0
  17. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE +0 -0
  18. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE +0 -0
  19. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL +0 -0
  20. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt +0 -0
  21. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt +0 -0
  22. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe +0 -0
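In practice, this release drops the TabTransformer model code: the entire autogluon/tabular/models/tab_transformer/ package is removed from the wheel, and the 668-line hunk below corresponds to tab_transformer_encoder.py. As a minimal, illustrative sketch (not part of the package itself), one might confirm the removal after upgrading like this:

    # Hypothetical post-upgrade check; the module path is taken from the file list above.
    import importlib.util

    spec = importlib.util.find_spec("autogluon.tabular.models.tab_transformer")
    print("tab_transformer present:", spec is not None)  # expected: False on 1.2.1b20250409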
@@ -1,668 +0,0 @@
- # Credits for code in this script to Milan Cvitkovic,
- # Xin Huang, Ashish Khetan and Zohar Karnin
-
- import calendar
- import datetime
- import re
- from collections import Counter
- from datetime import date, datetime
- from functools import partial
- from typing import Iterable, Union
-
- import numpy as np
- import pandas as pd
- import torch
- import torch.nn as nn
- from pandas import DataFrame
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.preprocessing import KBinsDiscretizer, PowerTransformer, QuantileTransformer, RobustScaler
-
-
- class WontEncodeError(Exception):
-     pass
-
-
- class EncBase:
-     cat_cards = []
-     cont_dim = 0
-
-     @property
-     def cat_dim(self):
-         return len(self.cat_cards)
-
-     def clean_data(self, data, dtype=None) -> list:
-         if isinstance(data, pd.Series):
-             data = data.replace({np.nan: None}).to_list()
-         if dtype == "float":
-             unclean_data = data
-             data = []
-             for i in unclean_data:
-                 try:
-                     data.append(float(i))
-                 except (ValueError, TypeError):
-                     data.append(None)
-         return data
-
-     def fit(self, data: pd.Series, dtype=None):
-         """
-         If dtype == 'float', clean_data will cast the contents of data to floats
-         """
-         if len(pd.unique(data)) == 1:
-             raise WontEncodeError("Column contains only one value")
-         data = self.clean_data(data, dtype)
-         return data
-
-     def enc_cat(self, data: Iterable):
-         raise NotImplementedError
-
-     def enc_cont(self, data: Iterable):
-         raise NotImplementedError
-
-
- class NullEnc(EncBase):
-     """
-     When you want to ignore a feature
-     """
-
-     def fit(self, data: Iterable):
-         pass
-
-     def enc_cat(self, data: Iterable):
-         pass
-
-     def enc_cont(self, data: Iterable):
-         pass
-
-
- class CategoricalOrdinalEnc(EncBase):
-     def __init__(self, sorted_items=None):
-         if sorted_items is not None:
-             assert sorted_items == sorted(sorted_items)
-             self.init_with_sorted_values(sorted_items)
-
-     def fit(self, data: pd.Series):
-         data = super().fit(data)
-         distinct_vals = [i for i in pd.unique(data) if i is not None]
-         sorted_vals = sorted(distinct_vals)
-         if len(sorted_vals) >= 0.5 * len(data) or max(Counter(data).values()) < 10: # sorry for hardcoding this...
-             raise WontEncodeError("Too many unique values to bother embedding")
-         self.init_with_sorted_values(sorted_vals)
-
-     def init_with_sorted_values(self, sorted_vals):
-         self._item_to_idx = {item: idx for idx, item in enumerate(sorted_vals, start=2)}
-         self._item_to_idx[None] = 1
-         self._item_to_idx[np.nan] = 1
-         self.cat_cards = [len(set(self._item_to_idx.values()))]
-
-     def enc_cat(self, data):
-         """
-         Values that the encoder has never seen before are returned as 1. 0 is reserved for padding.
-         """
-         data = self.clean_data(data)
-         idxs = [self._item_to_idx.get(item, 1) for item in data]
-         return torch.LongTensor(idxs).unsqueeze(1)
-
-     def enc_cont(self, data):
-         pass
-
-
- class ScalarQuantileOrdinalEnc(EncBase):
-     def __init__(self, n_bins_=None, bin_edges_=None):
-         if n_bins_ is not None and bin_edges_ is not None:
-             self.disc = self.get_new_base_enc()
-             self.disc.n_bins_ = np.array([n_bins_])
-             self.disc.bin_edges_ = np.array([np.array(bin_edges_), np.array(bin_edges_[:-1])])[:1] # Dumb hack, but it's what sklearn needs
-             self.cat_cards = [n_bins_ + 1]
-
-     def fit(self, data):
-         data = super().fit(data, dtype="float")
-         fit_data = [i for i in data if i is not None]
-         fit_data = np.array(fit_data).reshape(-1, 1)
-         self.disc = self.get_new_base_enc()
-         self.disc.fit(fit_data)
-         self.cat_cards = [self.disc.n_bins_.item() + 1]
-
-     def enc_cat(self, data):
-         """
-         Missing values are returned as category 1. 0 is reserved for padding.
-         """
-         data = self.clean_data(data, dtype="float")
-         data = np.array(data).reshape(-1, 1)
-         if None in data:
-             idxs = np.full(len(data), -1, dtype=int)
-             null_idxs = np.where(data == None)[0]
-             val_idxs = np.where(data != None)[0]
-             if len(val_idxs) > 0:
-                 vals = self.disc.transform(data[val_idxs]).reshape(-1)
-                 idxs[val_idxs] = vals + 2
-             idxs[null_idxs] = 1
-         else:
-             idxs = self.disc.transform(data).reshape(-1) + 2
-         return torch.LongTensor(idxs).unsqueeze(1)
-
-     def enc_cont(self, data):
-         pass
-
-     @staticmethod
-     def get_new_base_enc():
-         return KBinsDiscretizer(n_bins=8, encode="ordinal", strategy="quantile")
-
-     def get_base_enc_params(self):
-         return self.disc.n_bins_, self.disc.bin_edges_
-
-
- class ScalarRescaleEnc(EncBase):
-     cont_dim = 2
-     scaler = None
-
-     def enc_cat(self, data):
-         pass
-
-     def enc_cont(self, scalars):
-         """
-         Returns len(scalars) x 2 tensor, where the second column is a one-hot flag for missing data values
-         """
-         scalars = self.clean_data(scalars, dtype="float")
-         null_flag = np.full(len(scalars), np.nan, dtype=np.float32)
-         vals = np.full(len(scalars), np.nan, dtype=np.float32)
-         null_idxs = np.where(np.array(scalars) == None)[0]
-         val_idxs = np.where(np.array(scalars) != None)[0]
-
-         # One-hot flag for missing values
-         null_flag[null_idxs] = 1
-         null_flag[val_idxs] = 0
-         null_flag = null_flag.reshape(-1, 1)
-
-         # Transform scalar values
-         vals[val_idxs] = np.array(scalars, dtype=np.float32)[val_idxs]
-         vals = vals.reshape(-1, 1)
-         vals = self.scaler.transform(vals) + 1e-7 # Extra 1e-7 to help with correctness testing
-         vals[null_idxs] = 0
-
-         encoded = np.hstack((vals, null_flag))
-         encoded = encoded.clip(-5, 5) # Guarding against outlier values
-         return torch.FloatTensor(encoded)
-
-     @staticmethod
-     def get_new_base_enc():
-         raise NotImplementedError
-
-     def get_base_enc_params(self):
-         raise NotImplementedError
-
-
- class ScalarRobustScalerEnc(ScalarRescaleEnc):
-     def __init__(self, center_=None, scale_=None):
-         if center_ is not None and scale_ is not None:
-             self.scaler = self.get_new_base_enc()
-             self.scaler.center_ = center_
-             self.scaler.scale_ = scale_
-
-     def fit(self, data: pd.Series):
-         data = super().fit(data, dtype="float")
-         data = np.array(data).reshape(-1, 1)
-         self.scaler = self.get_new_base_enc()
-         self.scaler.fit(data)
-         if any(sum(np.isnan(p) for p in self.get_base_enc_params())):
-             self.scaler.center_ = 0
-             self.scaler.scale_ = 1
-
-     @staticmethod
-     def get_new_base_enc():
-         return RobustScaler()
-
-     def get_base_enc_params(self):
-         return self.scaler.center_, self.scaler.scale_
-
-
- class ScalarPowerTransformerEnc(ScalarRescaleEnc):
-     def __init__(self, lambdas_=None, scale_=None, mean_=None, var_=None, n_samples_seen_=None):
-         if all(a is not None for a in [lambdas_, scale_, mean_, var_, n_samples_seen_]):
-             self.scaler = self.get_new_base_enc()
-             self.scaler.fit([[0.0]]) # This is just to make the PowerTransformer initialize before we overwrite its params
-             self.scaler.lambdas_ = np.array([lambdas_])
-             self.scaler._scaler.scale_ = np.array([scale_])
-             self.scaler._scaler.mean_ = np.array([mean_])
-             self.scaler._scaler.var_ = np.array([var_])
-             self.scaler._scaler.n_samples_seen_ = n_samples_seen_
-
-     def fit(self, data):
-         data = super().fit(data, dtype="float")
-         data = np.array(data).reshape(-1, 1)
-         self.scaler = self.get_new_base_enc()
-         self.scaler.fit(data)
-
-     @staticmethod
-     def get_new_base_enc():
-         return PowerTransformer(method="yeo-johnson", standardize=True, copy=True)
-
-     def get_base_enc_params(self):
-         return self.scaler.lambdas_, self.scaler._scaler.scale_, self.scaler._scaler.mean_, self.scaler._scaler.var_, self.scaler._scaler.n_samples_seen_
-
-
- class ScalarQuantileTransformerEnc(ScalarRescaleEnc):
-     def __init__(self, n_quantiles_=None, quantiles_=None, references_=None):
-         if all(a is not None for a in [n_quantiles_, quantiles_, references_]):
-             self.scaler = self.get_new_base_enc()
-             self.scaler.n_quantiles_ = n_quantiles_
-             self.scaler.quantiles_ = np.array(quantiles_).reshape(-1, 1)
-             self.scaler.references_ = np.array(references_)
-
-     def fit(self, data):
-         data = super().fit(data, dtype="float")
-         data = np.array(data).reshape(-1, 1)
-         self.scaler = self.get_new_base_enc()
-         self.scaler.fit(data)
-
-     @staticmethod
-     def get_new_base_enc():
-         return QuantileTransformer()
-
-     def get_base_enc_params(self):
-         return self.scaler.n_quantiles_, self.scaler.quantiles_, self.scaler.references_
-
-
- class DatetimeScalarEnc(EncBase):
-     # int for type refers to the cardinality of the one-hot
-     cols_types = [
-         ("Year", "float"),
-         ("Month", 12),
-         ("Week", 53),
-         ("Day", 31),
-         ("Dayofweek", 7),
-         ("Dayofyear", "float"),
-         ("Is_month_end", 2),
-         ("Is_month_start", 2),
-         ("Is_quarter_end", 2),
-         ("Is_quarter_start", 2),
-         ("Is_year_end", 2),
-         ("Is_year_start", 2),
-         ("weekday_cos", "float"),
-         ("weekday_sin", "float"),
-         ("day_month_cos", "float"),
-         ("day_month_sin", "float"),
-         ("month_year_cos", "float"),
-         ("month_year_sin", "float"),
-         ("day_year_cos", "float"),
-         ("day_year_sin", "float"),
-     ]
-     cont_dim = sum([n if type(n) == int else 1 for _, n in cols_types])
-
-     def enc_cat(self, data):
-         pass
-
-     def enc_cont(self, datetimes):
-         datetimes = self.clean_data(datetimes)
-         df = pd.DataFrame({"dt": datetimes})
-         add_datepart(df, field_name="dt", prefix="", drop=False)
-         df = add_cyclic_datepart(df, field_name="dt", prefix="", drop=False)
-         enc = torch.empty(len(datetimes), self.cont_dim)
-         feats_done = 0
-         for c, t in self.cols_types:
-             feats_doing = 1 if t == "float" else t
-             if t == "float":
-                 feats = torch.FloatTensor(df[c].to_numpy()).view(-1, 1)
-                 if c == "Year":
-                     feats = (feats - 2000) / 10
-                 elif c == "Dayofyear":
-                     feats /= 365
-             else:
-                 feats = torch.LongTensor(df[c].to_numpy().astype("int32")).view(-1, 1)
-                 if c in ["Month", "Week", "Day"]:
-                     feats -= 1
-                 feats = one_hot(feats, t)
-             enc[:, feats_done : feats_done + feats_doing] = feats
-             feats_done += feats_doing
-         return enc
-
-
- class DatetimeOrdinalEnc(EncBase):
-     # These are all 1 larger than you'd expect to support missing values
-     cols_types = [
-         ("Month", 13),
-         ("Week", 54),
-         ("Day", 32),
-         ("Dayofweek", 8),
-         ("Is_month_end", 3),
-         ("Is_month_start", 3),
-         ("Is_quarter_end", 3),
-         ("Is_quarter_start", 3),
-         ("Is_year_end", 3),
-         ("Is_year_start", 3),
-     ]
-     cat_cards = [n for _, n in cols_types]
-
-     def enc_cat(self, datetimes):
-         # todo: add support for missing values, which should get encoded as 1.
-         datetimes = self.clean_data(datetimes)
-         df = pd.DataFrame({"dt": datetimes})
-         add_datepart(df, field_name="dt", prefix="", drop=False)
-         feats = []
-         for c, t in self.cols_types:
-             f = torch.LongTensor(df[c].to_numpy().astype("int32"))
-             if c in ["Month", "Week", "Day"]:
-                 f -= 1
-             feats.append(f)
-         feats = torch.stack(feats, dim=1) + 2 # + 2 for missing and padding
-         return feats
-
-     def enc_cont(self, data):
-         pass
-
-
- class LatLongScalarEnc(EncBase):
-     cont_dim = 5
-
-     def enc_cat(self, data):
-         pass
-
-     def enc_cont(self, latlongs):
-         latlongs = self.clean_data(latlongs)
-         if isinstance(latlongs[0], str):
-             fixed = []
-             for ll in latlongs:
-                 lat, long = ll.strip("()").split(",")
-                 lat, long = float(lat), float(long)
-                 fixed.append((lat, long))
-             latlongs = fixed
-         latlongs = np.array(latlongs)
-         lats, longs = latlongs[:, 0:1], latlongs[:, 1:2]
-         x = np.cos(lats) * np.cos(longs)
-         y = np.cos(lats) * np.sin(longs)
-         z = np.sin(lats)
-         lats /= 90
-         longs /= 180
-         latlongs = np.hstack((lats, longs, x, y, z))
-         return torch.Tensor(latlongs)
-
-
- class LatLongQuantileOrdinalEnc(EncBase):
-     def __init__(self, disc_params=None):
-         if disc_params is not None:
-             self.cat_cards = []
-             self.discs = self.get_new_base_enc()
-             for disc, (n_bins_, bin_edges_) in zip(self.discs, disc_params):
-                 disc.disc.n_bins_ = n_bins_
-                 disc.bin_edges_ = bin_edges_
-                 self.cat_cards.append(n_bins_ + 2)
-
-     def fit(self, data):
-         data = LatLongScalarEnc().enc_cont(data)
-         self.cat_cards = []
-         self.discs = self.get_new_base_enc()
-         for col, disc in enumerate(self.discs):
-             fit_data = data[:, col].numpy().reshape(-1, 1)
-             disc.fit(fit_data)
-             self.cat_cards.append(int(disc.n_bins_ + 2))
-
-     def enc_cat(self, data):
-         # todo: add support for missing values, which should get encoded as 1.
-         data = LatLongScalarEnc().enc_cont(data)
-         feats = []
-         for col, disc in enumerate(self.discs):
-             d = data[:, col].reshape(-1, 1)
-             d = disc.transform(d).reshape(-1)
-             d = d + 2 # for missing and padding
-             feats.append(d)
-         feats = np.stack(feats, axis=1)
-         return torch.LongTensor(feats)
-
-     def enc_cont(self, data):
-         pass
-
-     @staticmethod
-     def get_new_base_enc():
-         return [KBinsDiscretizer(n_bins=8, encode="ordinal", strategy="quantile") for _ in range(LatLongScalarEnc.cont_dim)]
-
-     def get_base_enc_params(self):
-         return [(disc.n_bins_, disc.bin_edges_) for disc in self.discs]
-
-
- class TfidfEnc(EncBase):
-     def __init__(self, vocabulary_=None, idf_=None):
-         if vocabulary_ is not None and idf_ is not None:
-             self.tfidf = self.get_new_base_enc()
-             self.tfidf.vocabulary_ = vocabulary_
-             self.tfidf.idf_ = np.array(idf_)
-             self.cont_dim = len(vocabulary_)
-
-     def enc_cat(self, data):
-         pass
-
-     def enc_cont(self, data):
-         data = self.clean_data(data)
-         text_strings = np.array([d if d is not None else "" for d in data])
-         encoded = self.tfidf.transform(text_strings)
-         encoded = torch.Tensor(encoded.todense())
-         # todo: wait until pytorch lets you use multiproc with sparse tensors
-         # encoded = encoded.tocoo()
-         # i = torch.LongTensor(np.vstack((encoded.row, encoded.col)))
-         # v = torch.FloatTensor(encoded.data)
-         # encoded = torch.sparse.FloatTensor(i, v, torch.Size(encoded.shape))
-         return encoded
-
-     def fit(self, data):
-         data = super().fit(data)
-         data = [d if d is not None else "" for d in data]
-         self.tfidf = self.get_new_base_enc().fit(data)
-         self.cont_dim = len(self.tfidf.vocabulary_)
-
-     @staticmethod
-     def get_new_base_enc():
-         return TfidfVectorizer(input="content", decode_error="replace", strip_accents="ascii", lowercase=True, analyzer="word", min_df=5 / 100000)
-
-     def get_base_enc_params(self):
-         return self.tfidf.vocabulary_, self.tfidf.idf_
-
-
- class TextSummaryScalarEnc(EncBase):
-     """
-     Returns the featuretools summary statistics about the text (num words and num_chars), but normalized
-     """
-
-     cont_dim = 2
-
-     def __init__(self, center_=None, scale_=None):
-         if center_ is not None and scale_ is not None:
-             self.scaler = RobustScaler()
-             self.scaler.center_ = center_
-             self.scaler.scale_ = scale_
-
-     def enc_cat(self, data):
-         pass
-
-     def enc_cont(self, data):
-         data = self.clean_data(data)
-         text_strings = [s if s is not None else "" for s in data]
-         encoded = self.get_encoded(text_strings)
-         encoded = self.scaler.transform(encoded)
-         encoded = torch.Tensor(encoded)
-         return encoded
-
-     def get_encoded(self, text_strings):
-         text_strings = [ts if ts is not None else "" for ts in text_strings]
-         num_chars = [len(ts) for ts in text_strings]
-         num_words = [len(ts.split()) for ts in text_strings]
-         return np.array((num_chars, num_words)).T
-
-     def fit(self, data):
-         data = super().fit(data)
-         encoded = self.get_encoded(data)
-         self.scaler = RobustScaler().fit(encoded)
-
-     def get_base_enc_params(self):
-         return self.scaler.center_, self.scaler.scale_
-
-
- class EmbeddingInitializer(nn.Module):
-     def __init__(
-         self,
-         num_embeddings,
-         max_emb_dim,
-         p_dropout,
-         minimize_emb_dim=True,
-         drop_whole_embeddings=False,
-         one_hot=False,
-         out_dim=None,
-         shared_embedding=False,
-         n_shared_embs=8,
-         shared_embedding_added=False,
-     ):
-         """
-         :param minimize_emb_dim:
-             Whether to set embedding_dim = max_emb_dim or to make embedding_dim smaller is num_embeddings is small
-         :param drop_whole_embeddings:
-             If True, dropout pretends the embedding was a missing value. If false, dropout sets embed features to 0
-         :param one_hot:
-             If True, one-hot encode variables whose cardinality is < max_emb_dim. Also, set requires_grad = False
-         :param out_dim:
-             If None, return the embedding straight from self.embed. If another dimension, put the embedding through a
-             Linear layer to make it size (batch x out_dim).
-         :param shared_embedding:
-             If True, 1/(n_shared_embs)th of every embedding will be reserved for a learned parameter that's common to all embeddings.
-             This is useful for transformers to identify which column an embedding came from.
-             Mutually exclusive with one_hot.
-
-         Note: the 0 embedding is reserved for padding and masking. The various encoders use 1 for missing values.
-
-         """
-         super().__init__()
-         assert not (one_hot and out_dim is not None)
-         self.p_dropout = p_dropout
-         self.drop_whole_embeddings = drop_whole_embeddings
-         self.shared_embedding = shared_embedding
-         self.shared_embedding_added = shared_embedding_added
-         if minimize_emb_dim or one_hot:
-             self.emb_dim = min(max_emb_dim, num_embeddings) # Don't use a crazy huge embedding if not needed
-         else:
-             self.emb_dim = max_emb_dim
-         self.reshape_out = nn.Identity()
-         if out_dim is not None:
-             assert self.emb_dim <= out_dim, "Makes no sense: just set max_emb_dim = out_dim and out_dim = None"
-             if num_embeddings > self.emb_dim:
-                 self.reshape_out = nn.Linear(self.emb_dim, out_dim, bias=True)
-             else:
-                 self.emb_dim = out_dim
-         # Note: if you change the name of self.embed, or initialize an embedding elsewhere in a model,
-         # the function get_optim will not work properly
-         self.embed = nn.Embedding(num_embeddings=num_embeddings + 1, embedding_dim=self.emb_dim, padding_idx=0)
-         self.embed.weight.data.clamp_(-2, 2) # Use truncated normal init
-         if one_hot:
-             self.embed.weight.requires_grad = False
-             if num_embeddings <= max_emb_dim:
-                 self.embed.weight.data[1:, :] = torch.eye(self.emb_dim)
-         if shared_embedding:
-             assert not one_hot
-             ce_dim = self.emb_dim if shared_embedding_added else (out_dim if out_dim else self.emb_dim) // n_shared_embs # used to be //8
-             self.shared_emb = nn.Parameter(torch.empty(1, ce_dim).uniform_(-1, 1))
-         self.do = nn.Dropout(p=p_dropout)
-
-     def forward(self, input):
-         if self.drop_whole_embeddings and self.training:
-             mask = torch.zeros_like(input).bernoulli_(1 - self.p_dropout)
-             input = input * mask
-         out = self.embed(input)
-         if not self.drop_whole_embeddings:
-             out = self.do(out)
-         out = self.reshape_out(out)
-         if self.shared_embedding:
-             shared_emb = self.shared_emb.expand(out.shape[0], -1)
-             if not self.shared_embedding_added:
-                 out[:, : shared_emb.shape[1]] = shared_emb
-             else:
-                 out += shared_emb
-         return out
-
-
- def one_hot(x, card):
-     assert isinstance(x, torch.LongTensor)
-     assert x.dim() == 2
-     x_one_hot = x.new_zeros(x.size()[0], card).scatter_(1, x, 1)
-     return x_one_hot
-
-
- """
- These functions stolen wholesale, with much gratitude, from
- https://github.com/fastai/fastai/blob/master/fastai/tabular/transform.py
- """
-
-
- def make_date(df: DataFrame, date_field: str):
-     "Make sure `df[field_name]` is of the right date type."
-     field_dtype = df[date_field].dtype
-     if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
-         field_dtype = np.datetime64
-     if not np.issubdtype(field_dtype, np.datetime64):
-         df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True, format="mixed")
-
-
- def add_datepart(df: DataFrame, field_name: str, prefix: str = None, drop: bool = True, time: bool = False):
-     "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
-     make_date(df, field_name)
-     field = df[field_name]
-     prefix = re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix
-     attr = [
-         "Year",
-         "Month",
-         "Week",
-         "Day",
-         "Dayofweek",
-         "Dayofyear",
-         "Is_month_end",
-         "Is_month_start",
-         "Is_quarter_end",
-         "Is_quarter_start",
-         "Is_year_end",
-         "Is_year_start",
-     ]
-     if time:
-         attr = attr + ["Hour", "Minute", "Second"]
-     for n in attr:
-         df[prefix + n] = getattr(field.dt, n.lower())
-     if drop:
-         df.drop(field_name, axis=1, inplace=True)
-     return df
-
-
- def cyclic_dt_feat_names(time: bool = True, add_linear: bool = False) -> list[str]:
-     "Return feature names of date/time cycles as produced by `cyclic_dt_features`."
-     fs = ["cos", "sin"]
-     attr = [f"{r}_{f}" for r in "weekday day_month month_year day_year".split() for f in fs]
-     if time:
-         attr += [f"{r}_{f}" for r in "hour clock min sec".split() for f in fs]
-     if add_linear:
-         attr.append("year_lin")
-     return attr
-
-
- def cyclic_dt_features(d: Union[date, datetime], time: bool = True, add_linear: bool = False) -> list[float]:
-     "Calculate the cos and sin of date/time cycles."
-     tt, fs = d.timetuple(), [np.cos, np.sin]
-     day_year, days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
-     days_year = 366 if calendar.isleap(d.year) else 365
-     rs = d.weekday() / 7, (d.day - 1) / days_month, (d.month - 1) / 12, (day_year - 1) / days_year
-     feats = [f(r * 2 * np.pi) for r in rs for f in fs]
-     if time and isinstance(d, datetime) and type(d) != date:
-         rs = tt.tm_hour / 24, tt.tm_hour % 12 / 12, tt.tm_min / 60, tt.tm_sec / 60
-         feats += [f(r * 2 * np.pi) for r in rs for f in fs]
-     if add_linear:
-         if type(d) == date:
-             feats.append(d.year + rs[-1])
-         else:
-             secs_in_year = (datetime(d.year + 1, 1, 1) - datetime(d.year, 1, 1)).total_seconds()
-             feats.append(d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year))
-     return feats
-
-
- def add_cyclic_datepart(df: DataFrame, field_name: str, prefix: str = None, drop: bool = True, time: bool = False, add_linear: bool = False):
-     "Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`."
-     make_date(df, field_name)
-     field = df[field_name]
-     prefix = re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix
-     series = field.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
-     columns = [prefix + c for c in cyclic_dt_feat_names(time, add_linear)]
-     df_feats = pd.DataFrame([item for item in series], columns=columns)
-     df = pd.concat([df, df_feats], axis=1)
-     if drop:
-         df.drop(field_name, axis=1, inplace=True)
-     return df