replay-rec 0.18.0rc0__py3-none-any.whl → 0.18.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. replay/__init__.py +1 -1
  2. replay/data/dataset.py +27 -1
  3. replay/data/dataset_utils/dataset_label_encoder.py +6 -3
  4. replay/data/nn/schema.py +37 -16
  5. replay/data/nn/sequence_tokenizer.py +313 -165
  6. replay/data/nn/torch_sequential_dataset.py +17 -8
  7. replay/data/nn/utils.py +14 -7
  8. replay/data/schema.py +10 -6
  9. replay/metrics/offline_metrics.py +2 -2
  10. replay/models/__init__.py +1 -0
  11. replay/models/base_rec.py +18 -21
  12. replay/models/lin_ucb.py +407 -0
  13. replay/models/nn/sequential/bert4rec/dataset.py +17 -4
  14. replay/models/nn/sequential/bert4rec/lightning.py +121 -54
  15. replay/models/nn/sequential/bert4rec/model.py +21 -0
  16. replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
  17. replay/models/nn/sequential/compiled/__init__.py +5 -0
  18. replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
  19. replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
  20. replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
  21. replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
  22. replay/models/nn/sequential/sasrec/dataset.py +17 -1
  23. replay/models/nn/sequential/sasrec/lightning.py +126 -50
  24. replay/models/nn/sequential/sasrec/model.py +3 -4
  25. replay/preprocessing/__init__.py +7 -1
  26. replay/preprocessing/discretizer.py +719 -0
  27. replay/preprocessing/label_encoder.py +384 -52
  28. replay/splitters/cold_user_random_splitter.py +1 -1
  29. replay/utils/__init__.py +1 -0
  30. replay/utils/common.py +7 -8
  31. replay/utils/session_handler.py +3 -4
  32. replay/utils/spark_utils.py +15 -1
  33. replay/utils/types.py +8 -0
  34. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +75 -70
  35. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -84
  36. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +1 -1
  37. replay/experimental/__init__.py +0 -0
  38. replay/experimental/metrics/__init__.py +0 -62
  39. replay/experimental/metrics/base_metric.py +0 -602
  40. replay/experimental/metrics/coverage.py +0 -97
  41. replay/experimental/metrics/experiment.py +0 -175
  42. replay/experimental/metrics/hitrate.py +0 -26
  43. replay/experimental/metrics/map.py +0 -30
  44. replay/experimental/metrics/mrr.py +0 -18
  45. replay/experimental/metrics/ncis_precision.py +0 -31
  46. replay/experimental/metrics/ndcg.py +0 -49
  47. replay/experimental/metrics/precision.py +0 -22
  48. replay/experimental/metrics/recall.py +0 -25
  49. replay/experimental/metrics/rocauc.py +0 -49
  50. replay/experimental/metrics/surprisal.py +0 -90
  51. replay/experimental/metrics/unexpectedness.py +0 -76
  52. replay/experimental/models/__init__.py +0 -10
  53. replay/experimental/models/admm_slim.py +0 -205
  54. replay/experimental/models/base_neighbour_rec.py +0 -204
  55. replay/experimental/models/base_rec.py +0 -1271
  56. replay/experimental/models/base_torch_rec.py +0 -234
  57. replay/experimental/models/cql.py +0 -454
  58. replay/experimental/models/ddpg.py +0 -923
  59. replay/experimental/models/dt4rec/__init__.py +0 -0
  60. replay/experimental/models/dt4rec/dt4rec.py +0 -189
  61. replay/experimental/models/dt4rec/gpt1.py +0 -401
  62. replay/experimental/models/dt4rec/trainer.py +0 -127
  63. replay/experimental/models/dt4rec/utils.py +0 -265
  64. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  65. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
  66. replay/experimental/models/implicit_wrap.py +0 -131
  67. replay/experimental/models/lightfm_wrap.py +0 -302
  68. replay/experimental/models/mult_vae.py +0 -332
  69. replay/experimental/models/neuromf.py +0 -406
  70. replay/experimental/models/scala_als.py +0 -296
  71. replay/experimental/nn/data/__init__.py +0 -1
  72. replay/experimental/nn/data/schema_builder.py +0 -55
  73. replay/experimental/preprocessing/__init__.py +0 -3
  74. replay/experimental/preprocessing/data_preparator.py +0 -839
  75. replay/experimental/preprocessing/padder.py +0 -229
  76. replay/experimental/preprocessing/sequence_generator.py +0 -208
  77. replay/experimental/scenarios/__init__.py +0 -1
  78. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  79. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
  80. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -248
  81. replay/experimental/scenarios/obp_wrapper/utils.py +0 -87
  82. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  83. replay/experimental/scenarios/two_stages/reranker.py +0 -117
  84. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
  85. replay/experimental/utils/__init__.py +0 -0
  86. replay/experimental/utils/logger.py +0 -24
  87. replay/experimental/utils/model_handler.py +0 -186
  88. replay/experimental/utils/session_handler.py +0 -44
  89. replay_rec-0.18.0rc0.dist-info/NOTICE +0 -41
  90. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
replay/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """ RecSys library """
2
2
 
3
- __version__ = "0.18.0.preview"
3
+ __version__ = "0.18.1"
replay/data/dataset.py CHANGED
@@ -458,13 +458,23 @@ class Dataset:
458
458
  if feature.feature_hint in [FeatureHint.ITEM_ID, FeatureHint.QUERY_ID]:
459
459
  return nunique(self._ids_feature_map[feature.feature_hint], column)
460
460
  assert feature.feature_source
461
+ if feature.feature_type == FeatureType.CATEGORICAL_LIST:
462
+ if self.is_spark:
463
+ data = (
464
+ self._feature_source_map[feature.feature_source]
465
+ .select(column)
466
+ .withColumn(column, sf.explode(column))
467
+ )
468
+ else:
469
+ data = self._feature_source_map[feature.feature_source][[column]].explode(column)
470
+ return nunique(data, column)
461
471
  return nunique(self._feature_source_map[feature.feature_source], column)
462
472
 
463
473
  return callback
464
474
 
465
475
  def _set_cardinality(self, features_list: Sequence[FeatureInfo]) -> None:
466
476
  for feature in features_list:
467
- if feature.feature_type == FeatureType.CATEGORICAL:
477
+ if feature.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
468
478
  feature._set_cardinality_callback(self._get_cardinality(feature))
469
479
 
470
480
  def _fill_feature_schema(self, feature_schema: FeatureSchema) -> FeatureSchema:
@@ -581,6 +591,7 @@ class Dataset:
581
591
  data: DataFrameLike,
582
592
  column: str,
583
593
  source: FeatureSource,
594
+ feature_type: FeatureType,
584
595
  cardinality: Optional[int],
585
596
  ) -> None:
586
597
  """
@@ -593,6 +604,16 @@ class Dataset:
593
604
  Option: Keep this criterion, but suggest the user to disable the check if he understands
594
605
  that the criterion will not pass.
595
606
  """
607
+ if feature_type == FeatureType.CATEGORICAL_LIST: # explode column if list
608
+ data = data.withColumn(column, sf.explode(column)) if self.is_spark else data[[column]].explode(column)
609
+
610
+ if self.is_pandas:
611
+ try:
612
+ data[column] = data[column].astype(int)
613
+ except Exception:
614
+ msg = f"IDs in {source.name}.{column} are not encoded. They are not int."
615
+ raise ValueError(msg)
616
+
596
617
  if self.is_pandas:
597
618
  is_int = np.issubdtype(dict(data.dtypes)[column], int)
598
619
  elif self.is_spark:
@@ -632,6 +653,7 @@ class Dataset:
632
653
  self.interactions,
633
654
  feature.column,
634
655
  FeatureSource.INTERACTIONS,
656
+ feature.feature_type,
635
657
  feature.cardinality,
636
658
  )
637
659
  if self.item_features is not None:
@@ -639,6 +661,7 @@ class Dataset:
639
661
  self.item_features,
640
662
  feature.column,
641
663
  FeatureSource.ITEM_FEATURES,
664
+ feature.feature_type,
642
665
  feature.cardinality,
643
666
  )
644
667
  elif feature.feature_hint == FeatureHint.QUERY_ID:
@@ -646,6 +669,7 @@ class Dataset:
646
669
  self.interactions,
647
670
  feature.column,
648
671
  FeatureSource.INTERACTIONS,
672
+ feature.feature_type,
649
673
  feature.cardinality,
650
674
  )
651
675
  if self.query_features is not None:
@@ -653,6 +677,7 @@ class Dataset:
653
677
  self.query_features,
654
678
  feature.column,
655
679
  FeatureSource.QUERY_FEATURES,
680
+ feature.feature_type,
656
681
  feature.cardinality,
657
682
  )
658
683
  else:
@@ -661,6 +686,7 @@ class Dataset:
661
686
  data,
662
687
  feature.column,
663
688
  feature.feature_source,
689
+ feature.feature_type,
664
690
  feature.cardinality,
665
691
  )
666
692
 
replay/data/dataset_utils/dataset_label_encoder.py CHANGED
@@ -8,8 +8,8 @@ Contains classes for encoding categorical data
8
8
  import warnings
9
9
  from typing import Dict, Iterable, Iterator, Optional, Sequence, Set, Union
10
10
 
11
- from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource
12
- from replay.preprocessing import LabelEncoder, LabelEncodingRule
11
+ from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, FeatureType
12
+ from replay.preprocessing import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
13
13
  from replay.preprocessing.label_encoder import HandleUnknownStrategies
14
14
 
15
15
 
@@ -62,7 +62,10 @@ class DatasetLabelEncoder:
62
62
 
63
63
  self._fill_features_columns(dataset.feature_schema)
64
64
  for column, feature_info in dataset.feature_schema.categorical_features.items():
65
- encoding_rule = LabelEncodingRule(
65
+ encoding_rule_class = (
66
+ SequenceEncodingRule if feature_info.feature_type == FeatureType.CATEGORICAL_LIST else LabelEncodingRule
67
+ )
68
+ encoding_rule = encoding_rule_class(
66
69
  column, handle_unknown=self._handle_unknown_rule, default_value=self._default_value_rule
67
70
  )
68
71
  if feature_info.feature_hint == FeatureHint.QUERY_ID:
replay/data/nn/schema.py CHANGED
@@ -70,6 +70,8 @@ class TensorFeatureInfo:
70
70
  Information about a tensor feature.
71
71
  """
72
72
 
73
+ DEFAULT_EMBEDDING_DIM = 64
74
+
73
75
  def __init__(
74
76
  self,
75
77
  name: str,
@@ -78,6 +80,7 @@ class TensorFeatureInfo:
78
80
  feature_hint: Optional[FeatureHint] = None,
79
81
  feature_sources: Optional[List[TensorFeatureSource]] = None,
80
82
  cardinality: Optional[int] = None,
83
+ padding_value: int = 0,
81
84
  embedding_dim: Optional[int] = None,
82
85
  tensor_dim: Optional[int] = None,
83
86
  ) -> None:
@@ -94,6 +97,7 @@ class TensorFeatureInfo:
94
97
  :param cardinality: cardinality of categorical feature, required for ids columns,
95
98
  optional for others,
96
99
  default: ``None``.
100
+ :param padding_value: value to pad sequences to desired length
97
101
  :param embedding_dim: embedding dimensions of categorical feature,
98
102
  default: ``None``.
99
103
  :param tensor_dim: tensor dimensions of numerical feature,
@@ -103,24 +107,24 @@ class TensorFeatureInfo:
103
107
  self._feature_hint = feature_hint
104
108
  self._feature_sources = feature_sources
105
109
  self._is_seq = is_seq
110
+ self._padding_value = padding_value
106
111
 
107
112
  if not isinstance(feature_type, FeatureType):
108
113
  msg = "Unknown feature type"
109
114
  raise ValueError(msg)
110
115
  self._feature_type = feature_type
111
116
 
112
- if feature_type == FeatureType.NUMERICAL and (cardinality or embedding_dim):
117
+ if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and (cardinality or embedding_dim):
113
118
  msg = "Cardinality and embedding dimensions are needed only with categorical feature type."
114
119
  raise ValueError(msg)
115
120
  self._cardinality = cardinality
116
121
 
117
- if feature_type == FeatureType.CATEGORICAL and tensor_dim:
122
+ if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST] and tensor_dim:
118
123
  msg = "Tensor dimensions is needed only with numerical feature type."
119
124
  raise ValueError(msg)
120
125
 
121
- if feature_type == FeatureType.CATEGORICAL:
122
- default_embedding_dim = 64
123
- self._embedding_dim = embedding_dim or default_embedding_dim
126
+ if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
127
+ self._embedding_dim = embedding_dim or self.DEFAULT_EMBEDDING_DIM
124
128
  else:
125
129
  self._tensor_dim = tensor_dim
126
130
 
@@ -176,7 +180,8 @@ class TensorFeatureInfo:
176
180
  @property
177
181
  def is_seq(self) -> bool:
178
182
  """
179
- :returns: Flag that feature is sequential.
183
+ :returns: Flag that feature is sequential.\n
184
+ Sequential means that the value of the feature will be determined for each element of the user's sequence.
180
185
  """
181
186
  return self._is_seq
182
187
 
@@ -185,21 +190,35 @@ class TensorFeatureInfo:
185
190
  """
186
191
  :returns: Flag that feature is categorical.
187
192
  """
188
- return self.feature_type == FeatureType.CATEGORICAL
193
+ return self.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]
189
194
 
190
195
  @property
191
196
  def is_num(self) -> bool:
192
197
  """
193
198
  :returns: Flag that feature is numerical.
194
199
  """
195
- return self.feature_type == FeatureType.NUMERICAL
200
+ return self.feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST]
201
+
202
+ @property
203
+ def is_list(self) -> bool:
204
+ """
205
+ :returns: Flag that feature is numerical list or categorical list.
206
+ """
207
+ return self.feature_type in [FeatureType.CATEGORICAL_LIST, FeatureType.NUMERICAL_LIST]
208
+
209
+ @property
210
+ def padding_value(self) -> int:
211
+ """
212
+ :returns: value to pad sequences to desired length.
213
+ """
214
+ return self._padding_value
196
215
 
197
216
  @property
198
217
  def cardinality(self) -> Optional[int]:
199
218
  """
200
219
  :returns: Cardinality of the feature.
201
220
  """
202
- if self.feature_type != FeatureType.CATEGORICAL:
221
+ if not self.is_cat:
203
222
  msg = f"Can not get cardinality because feature type of {self.name} column is not categorical."
204
223
  raise RuntimeError(msg)
205
224
  return self._cardinality
@@ -212,7 +231,7 @@ class TensorFeatureInfo:
212
231
  """
213
232
  :returns: Dimensions of the numerical feature.
214
233
  """
215
- if self.feature_type != FeatureType.NUMERICAL:
234
+ if not self.is_num:
216
235
  msg = f"Can not get tensor dimensions because feature type of {self.name} feature is not numerical."
217
236
  raise RuntimeError(msg)
218
237
  return self._tensor_dim
@@ -225,7 +244,7 @@ class TensorFeatureInfo:
225
244
  """
226
245
  :returns: Embedding dimensions of the feature.
227
246
  """
228
- if self.feature_type != FeatureType.CATEGORICAL:
247
+ if not self.is_cat:
229
248
  msg = f"Can not get embedding dimensions because feature type of {self.name} feature is not categorical."
230
249
  raise RuntimeError(msg)
231
250
  return self._embedding_dim
@@ -317,14 +336,16 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
317
336
  """
318
337
  :returns: Sequence of categorical features in a schema.
319
338
  """
320
- return self.filter(feature_type=FeatureType.CATEGORICAL)
339
+ return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
340
+ feature_type=FeatureType.CATEGORICAL_LIST
341
+ )
321
342
 
322
343
  @property
323
344
  def numerical_features(self) -> "TensorSchema":
324
345
  """
325
346
  :returns: Sequence of numerical features in a schema.
326
347
  """
327
- return self.filter(feature_type=FeatureType.NUMERICAL)
348
+ return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)
328
349
 
329
350
  @property
330
351
  def query_id_features(self) -> "TensorSchema":
@@ -423,9 +444,9 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
423
444
  if feature.feature_sources
424
445
  else None
425
446
  ),
426
- "cardinality": feature.cardinality if feature.feature_type == FeatureType.CATEGORICAL else None,
427
- "embedding_dim": feature.embedding_dim if feature.feature_type == FeatureType.CATEGORICAL else None,
428
- "tensor_dim": feature.tensor_dim if feature.feature_type == FeatureType.NUMERICAL else None,
447
+ "cardinality": feature.cardinality if feature.is_cat else None,
448
+ "embedding_dim": feature.embedding_dim if feature.is_cat else None,
449
+ "tensor_dim": feature.tensor_dim if feature.is_num else None,
429
450
  }
430
451
  for feature in self.all_features
431
452
  ]