replay-rec 0.18.0rc0__py3-none-any.whl → 0.18.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/dataset.py +27 -1
- replay/data/dataset_utils/dataset_label_encoder.py +6 -3
- replay/data/nn/schema.py +37 -16
- replay/data/nn/sequence_tokenizer.py +313 -165
- replay/data/nn/torch_sequential_dataset.py +17 -8
- replay/data/nn/utils.py +14 -7
- replay/data/schema.py +10 -6
- replay/metrics/offline_metrics.py +2 -2
- replay/models/__init__.py +1 -0
- replay/models/base_rec.py +18 -21
- replay/models/lin_ucb.py +407 -0
- replay/models/nn/sequential/bert4rec/dataset.py +17 -4
- replay/models/nn/sequential/bert4rec/lightning.py +121 -54
- replay/models/nn/sequential/bert4rec/model.py +21 -0
- replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
- replay/models/nn/sequential/compiled/__init__.py +5 -0
- replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
- replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
- replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
- replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
- replay/models/nn/sequential/sasrec/dataset.py +17 -1
- replay/models/nn/sequential/sasrec/lightning.py +126 -50
- replay/models/nn/sequential/sasrec/model.py +3 -4
- replay/preprocessing/__init__.py +7 -1
- replay/preprocessing/discretizer.py +719 -0
- replay/preprocessing/label_encoder.py +384 -52
- replay/splitters/cold_user_random_splitter.py +1 -1
- replay/utils/__init__.py +1 -0
- replay/utils/common.py +7 -8
- replay/utils/session_handler.py +3 -4
- replay/utils/spark_utils.py +15 -1
- replay/utils/types.py +8 -0
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +75 -70
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -84
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +1 -1
- replay/experimental/__init__.py +0 -0
- replay/experimental/metrics/__init__.py +0 -62
- replay/experimental/metrics/base_metric.py +0 -602
- replay/experimental/metrics/coverage.py +0 -97
- replay/experimental/metrics/experiment.py +0 -175
- replay/experimental/metrics/hitrate.py +0 -26
- replay/experimental/metrics/map.py +0 -30
- replay/experimental/metrics/mrr.py +0 -18
- replay/experimental/metrics/ncis_precision.py +0 -31
- replay/experimental/metrics/ndcg.py +0 -49
- replay/experimental/metrics/precision.py +0 -22
- replay/experimental/metrics/recall.py +0 -25
- replay/experimental/metrics/rocauc.py +0 -49
- replay/experimental/metrics/surprisal.py +0 -90
- replay/experimental/metrics/unexpectedness.py +0 -76
- replay/experimental/models/__init__.py +0 -10
- replay/experimental/models/admm_slim.py +0 -205
- replay/experimental/models/base_neighbour_rec.py +0 -204
- replay/experimental/models/base_rec.py +0 -1271
- replay/experimental/models/base_torch_rec.py +0 -234
- replay/experimental/models/cql.py +0 -454
- replay/experimental/models/ddpg.py +0 -923
- replay/experimental/models/dt4rec/__init__.py +0 -0
- replay/experimental/models/dt4rec/dt4rec.py +0 -189
- replay/experimental/models/dt4rec/gpt1.py +0 -401
- replay/experimental/models/dt4rec/trainer.py +0 -127
- replay/experimental/models/dt4rec/utils.py +0 -265
- replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
- replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
- replay/experimental/models/implicit_wrap.py +0 -131
- replay/experimental/models/lightfm_wrap.py +0 -302
- replay/experimental/models/mult_vae.py +0 -332
- replay/experimental/models/neuromf.py +0 -406
- replay/experimental/models/scala_als.py +0 -296
- replay/experimental/nn/data/__init__.py +0 -1
- replay/experimental/nn/data/schema_builder.py +0 -55
- replay/experimental/preprocessing/__init__.py +0 -3
- replay/experimental/preprocessing/data_preparator.py +0 -839
- replay/experimental/preprocessing/padder.py +0 -229
- replay/experimental/preprocessing/sequence_generator.py +0 -208
- replay/experimental/scenarios/__init__.py +0 -1
- replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
- replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
- replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -248
- replay/experimental/scenarios/obp_wrapper/utils.py +0 -87
- replay/experimental/scenarios/two_stages/__init__.py +0 -0
- replay/experimental/scenarios/two_stages/reranker.py +0 -117
- replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
- replay/experimental/utils/__init__.py +0 -0
- replay/experimental/utils/logger.py +0 -24
- replay/experimental/utils/model_handler.py +0 -186
- replay/experimental/utils/session_handler.py +0 -44
- replay_rec-0.18.0rc0.dist-info/NOTICE +0 -41
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
replay/__init__.py
CHANGED
replay/data/dataset.py
CHANGED
|
@@ -458,13 +458,23 @@ class Dataset:
|
|
|
458
458
|
if feature.feature_hint in [FeatureHint.ITEM_ID, FeatureHint.QUERY_ID]:
|
|
459
459
|
return nunique(self._ids_feature_map[feature.feature_hint], column)
|
|
460
460
|
assert feature.feature_source
|
|
461
|
+
if feature.feature_type == FeatureType.CATEGORICAL_LIST:
|
|
462
|
+
if self.is_spark:
|
|
463
|
+
data = (
|
|
464
|
+
self._feature_source_map[feature.feature_source]
|
|
465
|
+
.select(column)
|
|
466
|
+
.withColumn(column, sf.explode(column))
|
|
467
|
+
)
|
|
468
|
+
else:
|
|
469
|
+
data = self._feature_source_map[feature.feature_source][[column]].explode(column)
|
|
470
|
+
return nunique(data, column)
|
|
461
471
|
return nunique(self._feature_source_map[feature.feature_source], column)
|
|
462
472
|
|
|
463
473
|
return callback
|
|
464
474
|
|
|
465
475
|
def _set_cardinality(self, features_list: Sequence[FeatureInfo]) -> None:
|
|
466
476
|
for feature in features_list:
|
|
467
|
-
if feature.feature_type == FeatureType.CATEGORICAL:
|
|
477
|
+
if feature.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
|
|
468
478
|
feature._set_cardinality_callback(self._get_cardinality(feature))
|
|
469
479
|
|
|
470
480
|
def _fill_feature_schema(self, feature_schema: FeatureSchema) -> FeatureSchema:
|
|
@@ -581,6 +591,7 @@ class Dataset:
|
|
|
581
591
|
data: DataFrameLike,
|
|
582
592
|
column: str,
|
|
583
593
|
source: FeatureSource,
|
|
594
|
+
feature_type: FeatureType,
|
|
584
595
|
cardinality: Optional[int],
|
|
585
596
|
) -> None:
|
|
586
597
|
"""
|
|
@@ -593,6 +604,16 @@ class Dataset:
|
|
|
593
604
|
Option: Keep this criterion, but suggest the user to disable the check if he understands
|
|
594
605
|
that the criterion will not pass.
|
|
595
606
|
"""
|
|
607
|
+
if feature_type == FeatureType.CATEGORICAL_LIST: # explode column if list
|
|
608
|
+
data = data.withColumn(column, sf.explode(column)) if self.is_spark else data[[column]].explode(column)
|
|
609
|
+
|
|
610
|
+
if self.is_pandas:
|
|
611
|
+
try:
|
|
612
|
+
data[column] = data[column].astype(int)
|
|
613
|
+
except Exception:
|
|
614
|
+
msg = f"IDs in {source.name}.{column} are not encoded. They are not int."
|
|
615
|
+
raise ValueError(msg)
|
|
616
|
+
|
|
596
617
|
if self.is_pandas:
|
|
597
618
|
is_int = np.issubdtype(dict(data.dtypes)[column], int)
|
|
598
619
|
elif self.is_spark:
|
|
@@ -632,6 +653,7 @@ class Dataset:
|
|
|
632
653
|
self.interactions,
|
|
633
654
|
feature.column,
|
|
634
655
|
FeatureSource.INTERACTIONS,
|
|
656
|
+
feature.feature_type,
|
|
635
657
|
feature.cardinality,
|
|
636
658
|
)
|
|
637
659
|
if self.item_features is not None:
|
|
@@ -639,6 +661,7 @@ class Dataset:
|
|
|
639
661
|
self.item_features,
|
|
640
662
|
feature.column,
|
|
641
663
|
FeatureSource.ITEM_FEATURES,
|
|
664
|
+
feature.feature_type,
|
|
642
665
|
feature.cardinality,
|
|
643
666
|
)
|
|
644
667
|
elif feature.feature_hint == FeatureHint.QUERY_ID:
|
|
@@ -646,6 +669,7 @@ class Dataset:
|
|
|
646
669
|
self.interactions,
|
|
647
670
|
feature.column,
|
|
648
671
|
FeatureSource.INTERACTIONS,
|
|
672
|
+
feature.feature_type,
|
|
649
673
|
feature.cardinality,
|
|
650
674
|
)
|
|
651
675
|
if self.query_features is not None:
|
|
@@ -653,6 +677,7 @@ class Dataset:
|
|
|
653
677
|
self.query_features,
|
|
654
678
|
feature.column,
|
|
655
679
|
FeatureSource.QUERY_FEATURES,
|
|
680
|
+
feature.feature_type,
|
|
656
681
|
feature.cardinality,
|
|
657
682
|
)
|
|
658
683
|
else:
|
|
@@ -661,6 +686,7 @@ class Dataset:
|
|
|
661
686
|
data,
|
|
662
687
|
feature.column,
|
|
663
688
|
feature.feature_source,
|
|
689
|
+
feature.feature_type,
|
|
664
690
|
feature.cardinality,
|
|
665
691
|
)
|
|
666
692
|
|
|
replay/data/dataset_utils/dataset_label_encoder.py
CHANGED
|
@@ -8,8 +8,8 @@ Contains classes for encoding categorical data
|
|
|
8
8
|
import warnings
|
|
9
9
|
from typing import Dict, Iterable, Iterator, Optional, Sequence, Set, Union
|
|
10
10
|
|
|
11
|
-
from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource
|
|
12
|
-
from replay.preprocessing import LabelEncoder, LabelEncodingRule
|
|
11
|
+
from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, FeatureType
|
|
12
|
+
from replay.preprocessing import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
|
|
13
13
|
from replay.preprocessing.label_encoder import HandleUnknownStrategies
|
|
14
14
|
|
|
15
15
|
|
|
@@ -62,7 +62,10 @@ class DatasetLabelEncoder:
|
|
|
62
62
|
|
|
63
63
|
self._fill_features_columns(dataset.feature_schema)
|
|
64
64
|
for column, feature_info in dataset.feature_schema.categorical_features.items():
|
|
65
|
-
encoding_rule = LabelEncodingRule(
|
65
|
+
encoding_rule_class = (
|
|
66
|
+
SequenceEncodingRule if feature_info.feature_type == FeatureType.CATEGORICAL_LIST else LabelEncodingRule
|
|
67
|
+
)
|
|
68
|
+
encoding_rule = encoding_rule_class(
|
|
66
69
|
column, handle_unknown=self._handle_unknown_rule, default_value=self._default_value_rule
|
|
67
70
|
)
|
|
68
71
|
if feature_info.feature_hint == FeatureHint.QUERY_ID:
|
replay/data/nn/schema.py
CHANGED
|
@@ -70,6 +70,8 @@ class TensorFeatureInfo:
|
|
|
70
70
|
Information about a tensor feature.
|
|
71
71
|
"""
|
|
72
72
|
|
|
73
|
+
DEFAULT_EMBEDDING_DIM = 64
|
|
74
|
+
|
|
73
75
|
def __init__(
|
|
74
76
|
self,
|
|
75
77
|
name: str,
|
|
@@ -78,6 +80,7 @@ class TensorFeatureInfo:
|
|
|
78
80
|
feature_hint: Optional[FeatureHint] = None,
|
|
79
81
|
feature_sources: Optional[List[TensorFeatureSource]] = None,
|
|
80
82
|
cardinality: Optional[int] = None,
|
|
83
|
+
padding_value: int = 0,
|
|
81
84
|
embedding_dim: Optional[int] = None,
|
|
82
85
|
tensor_dim: Optional[int] = None,
|
|
83
86
|
) -> None:
|
|
@@ -94,6 +97,7 @@ class TensorFeatureInfo:
|
|
|
94
97
|
:param cardinality: cardinality of categorical feature, required for ids columns,
|
|
95
98
|
optional for others,
|
|
96
99
|
default: ``None``.
|
|
100
|
+
:param padding_value: value to pad sequences to desired length
|
|
97
101
|
:param embedding_dim: embedding dimensions of categorical feature,
|
|
98
102
|
default: ``None``.
|
|
99
103
|
:param tensor_dim: tensor dimensions of numerical feature,
|
|
@@ -103,24 +107,24 @@ class TensorFeatureInfo:
|
|
|
103
107
|
self._feature_hint = feature_hint
|
|
104
108
|
self._feature_sources = feature_sources
|
|
105
109
|
self._is_seq = is_seq
|
|
110
|
+
self._padding_value = padding_value
|
|
106
111
|
|
|
107
112
|
if not isinstance(feature_type, FeatureType):
|
|
108
113
|
msg = "Unknown feature type"
|
|
109
114
|
raise ValueError(msg)
|
|
110
115
|
self._feature_type = feature_type
|
|
111
116
|
|
|
112
|
-
if feature_type == FeatureType.NUMERICAL and (cardinality or embedding_dim):
|
|
117
|
+
if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and (cardinality or embedding_dim):
|
|
113
118
|
msg = "Cardinality and embedding dimensions are needed only with categorical feature type."
|
|
114
119
|
raise ValueError(msg)
|
|
115
120
|
self._cardinality = cardinality
|
|
116
121
|
|
|
117
|
-
if feature_type == FeatureType.CATEGORICAL and tensor_dim:
|
|
122
|
+
if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST] and tensor_dim:
|
|
118
123
|
msg = "Tensor dimensions is needed only with numerical feature type."
|
|
119
124
|
raise ValueError(msg)
|
|
120
125
|
|
|
121
|
-
if feature_type == FeatureType.CATEGORICAL:
|
|
122
|
-
default_embedding_dim = 64
|
123
|
-
self._embedding_dim = embedding_dim or default_embedding_dim
|
|
126
|
+
if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
|
|
127
|
+
self._embedding_dim = embedding_dim or self.DEFAULT_EMBEDDING_DIM
|
|
124
128
|
else:
|
|
125
129
|
self._tensor_dim = tensor_dim
|
|
126
130
|
|
|
@@ -176,7 +180,8 @@ class TensorFeatureInfo:
|
|
|
176
180
|
@property
|
|
177
181
|
def is_seq(self) -> bool:
|
|
178
182
|
"""
|
|
179
|
-
:returns: Flag that feature is sequential
|
|
183
|
+
:returns: Flag that feature is sequential.\n
|
|
184
|
+
Sequential means that the value of the feature will be determined for each element of the user's sequence.
|
|
180
185
|
"""
|
|
181
186
|
return self._is_seq
|
|
182
187
|
|
|
@@ -185,21 +190,35 @@ class TensorFeatureInfo:
|
|
|
185
190
|
"""
|
|
186
191
|
:returns: Flag that feature is categorical.
|
|
187
192
|
"""
|
|
188
|
-
return self.feature_type == FeatureType.CATEGORICAL
|
|
193
|
+
return self.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]
|
|
189
194
|
|
|
190
195
|
@property
|
|
191
196
|
def is_num(self) -> bool:
|
|
192
197
|
"""
|
|
193
198
|
:returns: Flag that feature is numerical.
|
|
194
199
|
"""
|
|
195
|
-
return self.feature_type == FeatureType.NUMERICAL
|
|
200
|
+
return self.feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST]
|
|
201
|
+
|
|
202
|
+
@property
|
|
203
|
+
def is_list(self) -> bool:
|
|
204
|
+
"""
|
|
205
|
+
:returns: Flag that feature is numerical list or categorical list.
|
|
206
|
+
"""
|
|
207
|
+
return self.feature_type in [FeatureType.CATEGORICAL_LIST, FeatureType.NUMERICAL_LIST]
|
|
208
|
+
|
|
209
|
+
@property
|
|
210
|
+
def padding_value(self) -> int:
|
|
211
|
+
"""
|
|
212
|
+
:returns: value to pad sequences to desired length.
|
|
213
|
+
"""
|
|
214
|
+
return self._padding_value
|
|
196
215
|
|
|
197
216
|
@property
|
|
198
217
|
def cardinality(self) -> Optional[int]:
|
|
199
218
|
"""
|
|
200
219
|
:returns: Cardinality of the feature.
|
|
201
220
|
"""
|
|
202
|
-
if self.feature_type != FeatureType.CATEGORICAL:
|
|
221
|
+
if not self.is_cat:
|
|
203
222
|
msg = f"Can not get cardinality because feature type of {self.name} column is not categorical."
|
|
204
223
|
raise RuntimeError(msg)
|
|
205
224
|
return self._cardinality
|
|
@@ -212,7 +231,7 @@ class TensorFeatureInfo:
|
|
|
212
231
|
"""
|
|
213
232
|
:returns: Dimensions of the numerical feature.
|
|
214
233
|
"""
|
|
215
|
-
if self.feature_type != FeatureType.NUMERICAL:
|
|
234
|
+
if not self.is_num:
|
|
216
235
|
msg = f"Can not get tensor dimensions because feature type of {self.name} feature is not numerical."
|
|
217
236
|
raise RuntimeError(msg)
|
|
218
237
|
return self._tensor_dim
|
|
@@ -225,7 +244,7 @@ class TensorFeatureInfo:
|
|
|
225
244
|
"""
|
|
226
245
|
:returns: Embedding dimensions of the feature.
|
|
227
246
|
"""
|
|
228
|
-
if self.feature_type != FeatureType.CATEGORICAL:
|
|
247
|
+
if not self.is_cat:
|
|
229
248
|
msg = f"Can not get embedding dimensions because feature type of {self.name} feature is not categorical."
|
|
230
249
|
raise RuntimeError(msg)
|
|
231
250
|
return self._embedding_dim
|
|
@@ -317,14 +336,16 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
|
|
|
317
336
|
"""
|
|
318
337
|
:returns: Sequence of categorical features in a schema.
|
|
319
338
|
"""
|
|
320
|
-
return self.filter(feature_type=FeatureType.CATEGORICAL)
|
|
339
|
+
return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
|
|
340
|
+
feature_type=FeatureType.CATEGORICAL_LIST
|
|
341
|
+
)
|
|
321
342
|
|
|
322
343
|
@property
|
|
323
344
|
def numerical_features(self) -> "TensorSchema":
|
|
324
345
|
"""
|
|
325
346
|
:returns: Sequence of numerical features in a schema.
|
|
326
347
|
"""
|
|
327
|
-
return self.filter(feature_type=FeatureType.NUMERICAL)
|
|
348
|
+
return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)
|
|
328
349
|
|
|
329
350
|
@property
|
|
330
351
|
def query_id_features(self) -> "TensorSchema":
|
|
@@ -423,9 +444,9 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
|
|
|
423
444
|
if feature.feature_sources
|
|
424
445
|
else None
|
|
425
446
|
),
|
|
426
|
-
"cardinality": feature.cardinality if feature.feature_type == FeatureType.CATEGORICAL else None,
|
|
427
|
-
"embedding_dim": feature.embedding_dim if feature.feature_type == FeatureType.CATEGORICAL else None,
|
|
428
|
-
"tensor_dim": feature.tensor_dim if feature.feature_type == FeatureType.NUMERICAL else None,
|
|
447
|
+
"cardinality": feature.cardinality if feature.is_cat else None,
|
|
448
|
+
"embedding_dim": feature.embedding_dim if feature.is_cat else None,
|
|
449
|
+
"tensor_dim": feature.tensor_dim if feature.is_num else None,
|
|
429
450
|
}
|
|
430
451
|
for feature in self.all_features
|
|
431
452
|
]
|