replay-rec 0.18.0rc0__py3-none-any.whl → 0.18.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. replay/__init__.py +1 -1
  2. replay/data/dataset.py +27 -1
  3. replay/data/dataset_utils/dataset_label_encoder.py +6 -3
  4. replay/data/nn/schema.py +37 -16
  5. replay/data/nn/sequence_tokenizer.py +313 -165
  6. replay/data/nn/torch_sequential_dataset.py +17 -8
  7. replay/data/nn/utils.py +14 -7
  8. replay/data/schema.py +10 -6
  9. replay/metrics/offline_metrics.py +2 -2
  10. replay/models/__init__.py +1 -0
  11. replay/models/base_rec.py +18 -21
  12. replay/models/lin_ucb.py +407 -0
  13. replay/models/nn/sequential/bert4rec/dataset.py +17 -4
  14. replay/models/nn/sequential/bert4rec/lightning.py +121 -54
  15. replay/models/nn/sequential/bert4rec/model.py +21 -0
  16. replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
  17. replay/models/nn/sequential/compiled/__init__.py +5 -0
  18. replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
  19. replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
  20. replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
  21. replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
  22. replay/models/nn/sequential/sasrec/dataset.py +17 -1
  23. replay/models/nn/sequential/sasrec/lightning.py +126 -50
  24. replay/models/nn/sequential/sasrec/model.py +3 -4
  25. replay/preprocessing/__init__.py +7 -1
  26. replay/preprocessing/discretizer.py +719 -0
  27. replay/preprocessing/label_encoder.py +384 -52
  28. replay/splitters/cold_user_random_splitter.py +1 -1
  29. replay/utils/__init__.py +1 -0
  30. replay/utils/common.py +7 -8
  31. replay/utils/session_handler.py +3 -4
  32. replay/utils/spark_utils.py +15 -1
  33. replay/utils/types.py +8 -0
  34. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +75 -70
  35. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -84
  36. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +1 -1
  37. replay/experimental/__init__.py +0 -0
  38. replay/experimental/metrics/__init__.py +0 -62
  39. replay/experimental/metrics/base_metric.py +0 -602
  40. replay/experimental/metrics/coverage.py +0 -97
  41. replay/experimental/metrics/experiment.py +0 -175
  42. replay/experimental/metrics/hitrate.py +0 -26
  43. replay/experimental/metrics/map.py +0 -30
  44. replay/experimental/metrics/mrr.py +0 -18
  45. replay/experimental/metrics/ncis_precision.py +0 -31
  46. replay/experimental/metrics/ndcg.py +0 -49
  47. replay/experimental/metrics/precision.py +0 -22
  48. replay/experimental/metrics/recall.py +0 -25
  49. replay/experimental/metrics/rocauc.py +0 -49
  50. replay/experimental/metrics/surprisal.py +0 -90
  51. replay/experimental/metrics/unexpectedness.py +0 -76
  52. replay/experimental/models/__init__.py +0 -10
  53. replay/experimental/models/admm_slim.py +0 -205
  54. replay/experimental/models/base_neighbour_rec.py +0 -204
  55. replay/experimental/models/base_rec.py +0 -1271
  56. replay/experimental/models/base_torch_rec.py +0 -234
  57. replay/experimental/models/cql.py +0 -454
  58. replay/experimental/models/ddpg.py +0 -923
  59. replay/experimental/models/dt4rec/__init__.py +0 -0
  60. replay/experimental/models/dt4rec/dt4rec.py +0 -189
  61. replay/experimental/models/dt4rec/gpt1.py +0 -401
  62. replay/experimental/models/dt4rec/trainer.py +0 -127
  63. replay/experimental/models/dt4rec/utils.py +0 -265
  64. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  65. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
  66. replay/experimental/models/implicit_wrap.py +0 -131
  67. replay/experimental/models/lightfm_wrap.py +0 -302
  68. replay/experimental/models/mult_vae.py +0 -332
  69. replay/experimental/models/neuromf.py +0 -406
  70. replay/experimental/models/scala_als.py +0 -296
  71. replay/experimental/nn/data/__init__.py +0 -1
  72. replay/experimental/nn/data/schema_builder.py +0 -55
  73. replay/experimental/preprocessing/__init__.py +0 -3
  74. replay/experimental/preprocessing/data_preparator.py +0 -839
  75. replay/experimental/preprocessing/padder.py +0 -229
  76. replay/experimental/preprocessing/sequence_generator.py +0 -208
  77. replay/experimental/scenarios/__init__.py +0 -1
  78. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  79. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
  80. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -248
  81. replay/experimental/scenarios/obp_wrapper/utils.py +0 -87
  82. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  83. replay/experimental/scenarios/two_stages/reranker.py +0 -117
  84. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
  85. replay/experimental/utils/__init__.py +0 -0
  86. replay/experimental/utils/logger.py +0 -24
  87. replay/experimental/utils/model_handler.py +0 -186
  88. replay/experimental/utils/session_handler.py +0 -44
  89. replay_rec-0.18.0rc0.dist-info/NOTICE +0 -41
  90. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
replay/data/nn/torch_sequential_dataset.py CHANGED
@@ -4,6 +4,8 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset as TorchDataset
 
+from replay.utils.model_handler import deprecation_warning
+
 from .schema import TensorFeatureInfo, TensorMap, TensorSchema
 from .sequential_dataset import SequentialDataset
 
@@ -25,6 +27,10 @@ class TorchSequentialDataset(TorchDataset):
     Torch dataset for sequential recommender models
     """
 
+    @deprecation_warning(
+        "`padding_value` parameter will be removed in future versions. "
+        "Instead, you should specify `padding_value` for each column in TensorSchema"
+    )
     def __init__(
         self,
         sequential: SequentialDataset,
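The decorator applied to `__init__` above comes from `replay.utils.model_handler` and its implementation is not shown in this diff. As a rough, hypothetical sketch of the pattern (not the library's actual code), such a decorator only needs to emit a `DeprecationWarning` whenever the wrapped callable is invoked:

```python
# Hypothetical sketch only; the real helper lives in replay.utils.model_handler
# and may accept different arguments or warn more selectively.
import functools
import warnings


def deprecation_warning(message: str):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(message, DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        return wrapper

    return decorator
```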
@@ -90,15 +96,14 @@ class TorchSequentialDataset(TorchDataset):
         sequence = self._sequential.get_sequence(sequence_index, feature.name)
         if feature.is_seq:
             sequence = sequence[sequence_offset : sequence_offset + self._max_sequence_length]
-
-        tensor_dtype = self._get_tensor_dtype(feature)
+        tensor_dtype = self._get_tensor_dtype(sequence)
         tensor_sequence = torch.tensor(sequence, dtype=tensor_dtype)
         if feature.is_seq:
-            tensor_sequence = self._pad_sequence(tensor_sequence)
+            tensor_sequence = self._pad_sequence(tensor_sequence, feature.padding_value)
 
         return tensor_sequence
 
-    def _pad_sequence(self, sequence: torch.Tensor) -> torch.Tensor:
+    def _pad_sequence(self, sequence: torch.Tensor, padding_value: int) -> torch.Tensor:
         assert len(sequence) <= self._max_sequence_length
         if len(sequence) == self._max_sequence_length:
             return sequence
@@ -115,16 +120,16 @@
 
         padded_sequence = torch.full(
             padded_sequence_shape,
-            self._padding_value,
+            padding_value,
             dtype=sequence.dtype,
         )
         padded_sequence[-len(sequence) :].copy_(sequence)
         return padded_sequence
 
-    def _get_tensor_dtype(self, feature: TensorFeatureInfo) -> torch.dtype:
-        if feature.is_cat:
+    def _get_tensor_dtype(self, array: np.array) -> torch.dtype:
+        if np.issubdtype(array.dtype, np.integer):
             return torch.long
-        if feature.is_num:
+        if np.issubdtype(array.dtype, np.floating):
             return torch.float32
         assert False, "Unknown tensor feature type"
 
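With this change the tensor dtype is inferred from the NumPy array's dtype instead of the feature's categorical/numerical flag. A standalone restatement of the new mapping (synthetic inputs, raising `ValueError` instead of the library's `assert`):

```python
import numpy as np
import torch


def infer_tensor_dtype(array: np.ndarray) -> torch.dtype:
    # Integer-typed sequences (e.g. encoded categorical ids) become torch.long.
    if np.issubdtype(array.dtype, np.integer):
        return torch.long
    # Floating-point sequences (numerical features) become torch.float32.
    if np.issubdtype(array.dtype, np.floating):
        return torch.float32
    raise ValueError("Unknown tensor feature type")


assert infer_tensor_dtype(np.array([1, 2, 3])) == torch.long
assert infer_tensor_dtype(np.array([0.5, 1.0])) == torch.float32
```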
@@ -170,6 +175,10 @@ class TorchSequentialValidationDataset(TorchDataset):
     Torch dataset for sequential recommender models that additionally stores ground truth
     """
 
+    @deprecation_warning(
+        "`padding_value` parameter will be removed in future versions. "
+        "Instead, you should specify `padding_value` for each column in TensorSchema"
+    )
     def __init__(
         self,
         sequential: SequentialDataset,
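Taken together, these changes move padding configuration from the dataset constructor to the schema: `_pad_sequence` now reads `feature.padding_value`, and the old constructor argument is deprecated. A hedged sketch of what per-column padding could look like, assuming `TensorFeatureInfo` accepts a `padding_value` argument (consistent with the attribute access above, but not verified against the released signature):

```python
# Assumed API sketch; argument names may not match replay 0.18.1 exactly.
from replay.data import FeatureType
from replay.data.nn import TensorFeatureInfo, TensorSchema

tensor_schema = TensorSchema(
    [
        TensorFeatureInfo(
            name="item_id",
            feature_type=FeatureType.CATEGORICAL,
            is_seq=True,
            padding_value=0,  # per-column padding instead of a dataset-wide value
        ),
    ]
)
```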
replay/data/nn/utils.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Iterable, Optional
 
 import polars as pl
 
@@ -22,9 +22,12 @@ def groupby_sequences(events: DataFrameLike, groupby_col: str, sort_col: Optiona
         event_cols_without_groupby.remove(groupby_col)
 
         if sort_col:
-            event_cols_without_groupby.remove(sort_col)
-            event_cols_without_groupby.insert(0, sort_col)
-            events = events.sort_values(event_cols_without_groupby)
+            event_cols_without_iterable = list(
+                filter(lambda x: not isinstance(events.iloc[0][x], Iterable), event_cols_without_groupby)
+            )  # deleting columns that cannot be sorted
+            event_cols_without_iterable.remove(sort_col)
+            event_cols_without_iterable.insert(0, sort_col)
+            events = events.sort_values(event_cols_without_iterable)
 
         grouped_sequences = (
             events.groupby(groupby_col).agg({col: list for col in event_cols_without_groupby}).reset_index()
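The pandas branch now excludes list-valued columns from the sort keys, since `sort_values` cannot compare cells that hold Python iterables. A small illustration with made-up data (note that `str` is also an `Iterable`, so string columns would be filtered out the same way):

```python
from typing import Iterable

import pandas as pd

events = pd.DataFrame(
    {
        "user_id": [1, 1, 2],
        "timestamp": [3, 1, 2],
        "item_ids": [[10, 11], [12], [13]],  # list-valued column: not sortable
    }
)

candidate_cols = ["timestamp", "item_ids"]
sortable = [c for c in candidate_cols if not isinstance(events.iloc[0][c], Iterable)]
print(sortable)  # ['timestamp']
```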
@@ -34,9 +37,13 @@ def groupby_sequences(events: DataFrameLike, groupby_col: str, sort_col: Optiona
         event_cols_without_groupby.remove(groupby_col)
 
         if sort_col:
-            event_cols_without_groupby.remove(sort_col)
-            event_cols_without_groupby.insert(0, sort_col)
-            events = events.sort(event_cols_without_groupby)
+            map_name2type = dict(zip(events.columns, events.dtypes))
+            event_cols_without_iterable = list(
+                filter(lambda x: not isinstance(map_name2type[x], pl.List), event_cols_without_groupby)
+            )  # deleting columns that cannot be sorted
+            event_cols_without_iterable.remove(sort_col)
+            event_cols_without_iterable.insert(0, sort_col)
+            events = events.sort(event_cols_without_iterable)
 
         grouped_sequences = events.group_by(groupby_col).agg(*[pl.col(x) for x in event_cols_without_groupby])
     else:
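The polars branch makes the same decision from the column dtype instead of inspecting a value: any column whose dtype is `pl.List` is dropped from the sort keys. Illustrative snippet with made-up data:

```python
import polars as pl

events = pl.DataFrame(
    {
        "user_id": [1, 1, 2],
        "timestamp": [3, 1, 2],
        "item_ids": [[10, 11], [12], [13]],  # pl.List(pl.Int64) column
    }
)

map_name2type = dict(zip(events.columns, events.dtypes))
sortable = [c for c in ["timestamp", "item_ids"] if not isinstance(map_name2type[c], pl.List)]
print(sortable)  # ['timestamp']
```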
replay/data/schema.py CHANGED
@@ -20,7 +20,9 @@ class FeatureType(Enum):
     """Type of Feature."""
 
     CATEGORICAL = "categorical"
+    CATEGORICAL_LIST = "categorical_list"
     NUMERICAL = "numerical"
+    NUMERICAL_LIST = "numerical_list"
 
 
 class FeatureSource(Enum):
@@ -70,7 +72,7 @@ class FeatureInfo:
         self._feature_source = feature_source
         self._feature_hint = feature_hint
 
-        if feature_type == FeatureType.NUMERICAL and cardinality:
+        if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and cardinality:
             msg = "Cardinality is needed only with categorical feature_type."
             raise ValueError(msg)
         self._cardinality = cardinality
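The cardinality check now covers the list variant as well. A hedged usage sketch (constructor argument names taken from the surrounding code, not verified against the full signature):

```python
from replay.data import FeatureInfo, FeatureType

# Expected to raise ValueError: cardinality only applies to categorical features.
try:
    FeatureInfo(
        column="watch_times",
        feature_type=FeatureType.NUMERICAL_LIST,
        cardinality=100,
    )
except ValueError as err:
    print(err)  # Cardinality is needed only with categorical feature_type.
```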
@@ -111,7 +113,7 @@ class FeatureInfo:
         """
         :returns: cardinality of the feature.
         """
-        if self.feature_type != FeatureType.CATEGORICAL:
+        if self.feature_type not in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
             msg = f"Can not get cardinality because feature_type of {self.column} column is not categorical."
             raise RuntimeError(msg)
         if hasattr(self, "_cardinality_callback") and self._cardinality is None:
@@ -143,7 +145,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
 
     def copy(self) -> "FeatureSchema":
         """
-        Creates a copy of all features.
+        Creates a copy of all features. For the returned copy, all cardinality values will be undefined.
 
         :returns: copy of the initial feature schema.
         """
@@ -227,14 +229,16 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
         """
         :returns: sequence of categorical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.CATEGORICAL)
+        return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
+            feature_type=FeatureType.CATEGORICAL_LIST
+        )
 
     @property
     def numerical_features(self) -> "FeatureSchema":
         """
         :returns: sequence of numerical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.NUMERICAL)
+        return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)
 
     @property
     def interaction_features(self) -> "FeatureSchema":
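With the new list types, the `categorical_features` and `numerical_features` groups include both scalar and list-valued columns. A hedged sketch, assuming `FeatureSchema` accepts a list of `FeatureInfo` and exposes a `columns` accessor:

```python
from replay.data import FeatureInfo, FeatureSchema, FeatureType

schema = FeatureSchema(
    [
        FeatureInfo(column="genre", feature_type=FeatureType.CATEGORICAL),
        FeatureInfo(column="genre_history", feature_type=FeatureType.CATEGORICAL_LIST),
        FeatureInfo(column="price", feature_type=FeatureType.NUMERICAL),
    ]
)

# Scalar and list-valued categorical columns are now grouped together.
print(list(schema.categorical_features.columns))  # ['genre', 'genre_history']
print(list(schema.numerical_features.columns))    # ['price']
```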
@@ -449,7 +453,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
 
         if len(duplicates) > 0:
             msg = (
-                "Features column names should be unique, exept ITEM_ID and QUERY_ID columns. "
+                "Features column names should be unique, except ITEM_ID and QUERY_ID columns. "
                 f"{duplicates} columns are not unique."
             )
             raise ValueError(msg)
replay/metrics/offline_metrics.py CHANGED
@@ -156,13 +156,13 @@ class OfflineMetrics:
     ):
         """
         :param metrics: (list of metrics): List of metrics to be calculated.
-        :param user_column: (str): The name of the user column.
+        :param query_column: (str): The name of the query column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
         :param item_column: (str): The name of the item column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
-        :param score_column: (str): The name of the score column.
+        :param rating_column: (str): The name of the rating column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
         :param category_column: (str): The name of the category column.
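The docstring now matches the constructor's actual parameter names (`query_column`, `rating_column`). A hedged usage sketch under that assumption (tiny made-up dataframes; exact defaults and return format not verified here):

```python
import pandas as pd
from replay.metrics import NDCG, OfflineMetrics, Recall

recommendations = pd.DataFrame(
    {"user_id": [1, 1, 2], "item_id": [10, 11, 10], "relevance": [0.9, 0.5, 0.8]}
)
ground_truth = pd.DataFrame({"user_id": [1, 2], "item_id": [10, 12]})

metrics = OfflineMetrics(
    [NDCG(2), Recall(2)],
    query_column="user_id",
    rating_column="relevance",
)
print(metrics(recommendations, ground_truth))
```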
replay/models/__init__.py CHANGED
@@ -14,6 +14,7 @@ from .cat_pop_rec import CatPopRec
 from .cluster import ClusterRec
 from .kl_ucb import KLUCB
 from .knn import ItemKNN
+from .lin_ucb import LinUCB
 from .pop_rec import PopRec
 from .query_pop_rec import QueryPopRec
 from .random_rec import RandomRec
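The new `LinUCB` contextual-bandit model (added in `replay/models/lin_ucb.py`) is now exported from the package namespace, so it can be imported directly; its constructor arguments are not shown in this diff:

```python
from replay.models import LinUCB  # newly exported alongside KLUCB, ItemKNN, etc.
```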
replay/models/base_rec.py CHANGED
@@ -625,23 +625,21 @@ class BaseRecommender(RecommenderCommons, IsSavable, ABC):
         self, dataset: Dataset, k: int, queries: SparkDataFrame, items: SparkDataFrame, filter_seen_items: bool = True
     ) -> np.ndarray:
         """
-        Inner method where model actually predicts.
+        Inner method where model actually predicts probability estimates.
+
+        Mainly used in ```OBPOfflinePolicyLearner```.
 
-        :param log: historical log of interactions
+        :param dataset: historical interactions with query/item features
             ``[user_idx, item_idx, timestamp, rating]``
         :param k: number of recommendations for each user
-        :param users: users to create recommendations for
+        :param queries: queries to create recommendations for
             dataframe containing ``[user_idx]`` or ``array-like``;
-            if ``None``, recommend to all users from ``log``
+            if ``None``, recommend to all queries from ``interactions``
         :param items: candidate items for recommendations
             dataframe containing ``[item_idx]`` or ``array-like``;
-            if ``None``, take all items from ``log``.
+            if ``None``, take all items from ``interactions``.
             If it contains new items, ``rating`` for them will be ``0``.
-        :param user_features: user features
-            ``[user_idx , timestamp]`` + feature columns
-        :param item_features: item features
-            ``[item_idx , timestamp]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations based on ``log``.
+        :param filter_seen_items: flag to remove seen items from recommendations based on ``interactions``.
         :return: distribution over items for each user with shape
             ``(n_users, n_items, k)``
             where we have probability for each user to choose item at fixed position(top-k).
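The docstring describes the return value as a distribution over items for each user at each of the top-k positions, i.e. an array of shape `(n_users, n_items, k)`. A synthetic NumPy illustration of that shape contract, assuming probabilities are normalized over the item axis at each position (a reading of the docstring, not verified against the implementation):

```python
import numpy as np

n_users, n_items, k = 2, 4, 3
rng = np.random.default_rng(0)
proba = rng.random((n_users, n_items, k))
proba /= proba.sum(axis=1, keepdims=True)  # normalize over items at each position

assert proba.shape == (n_users, n_items, k)
assert np.allclose(proba.sum(axis=1), 1.0)
```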
@@ -1164,10 +1162,11 @@ class HybridRecommender(BaseRecommender, ABC):
     ) -> Optional[Tuple[SparkDataFrame, int]]:
         """
         Returns query or item feature vectors as a Column with type ArrayType
+        If a model does not have a vector for some ids they are not present in the final result.
+
         :param ids: Spark DataFrame with unique ids
         :param features: Spark DataFrame with features for provided ids
         :return: feature vectors
-            If a model does not have a vector for some ids they are not present in the final result.
         """
         return self._get_features_wrap(ids, features)
 
@@ -1644,23 +1643,21 @@ class NonPersonalizedRecommender(Recommender, ABC):
         self, dataset: Dataset, k: int, queries: SparkDataFrame, items: SparkDataFrame, filter_seen_items: bool = True
     ) -> np.ndarray:
         """
-        Inner method where model actually predicts.
+        Inner method where model actually predicts probability estimates.
+
+        Mainly used in ```OBPOfflinePolicyLearner```.
 
-        :param log: historical log of interactions
+        :param dataset: historical interactions with query/item features
             ``[user_idx, item_idx, timestamp, rating]``
         :param k: number of recommendations for each user
-        :param users: users to create recommendations for
+        :param queries: queries to create recommendations for
             dataframe containing ``[user_idx]`` or ``array-like``;
-            if ``None``, recommend to all users from ``log``
+            if ``None``, recommend to all queries from ``interactions``
         :param items: candidate items for recommendations
             dataframe containing ``[item_idx]`` or ``array-like``;
-            if ``None``, take all items from ``log``.
+            if ``None``, take all items from ``interactions``.
             If it contains new items, ``rating`` for them will be ``0``.
-        :param user_features: user features
-            ``[user_idx , timestamp]`` + feature columns
-        :param item_features: item features
-            ``[item_idx , timestamp]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations based on ``log``.
+        :param filter_seen_items: flag to remove seen items from recommendations based on ``interactions``.
         :return: distribution over items for each user with shape
             ``(n_users, n_items, k)``
             where we have probability for each user to choose item at fixed position(top-k).