replay-rec 0.20.1rc0-py3-none-any.whl → 0.20.3-py3-none-any.whl

This diff shows the differences between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (62)
  1. replay/__init__.py +1 -1
  2. replay/data/nn/sequential_dataset.py +8 -2
  3. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.3.dist-info}/METADATA +18 -12
  4. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.3.dist-info}/RECORD +7 -62
  5. replay/experimental/__init__.py +0 -0
  6. replay/experimental/metrics/__init__.py +0 -62
  7. replay/experimental/metrics/base_metric.py +0 -603
  8. replay/experimental/metrics/coverage.py +0 -97
  9. replay/experimental/metrics/experiment.py +0 -175
  10. replay/experimental/metrics/hitrate.py +0 -26
  11. replay/experimental/metrics/map.py +0 -30
  12. replay/experimental/metrics/mrr.py +0 -18
  13. replay/experimental/metrics/ncis_precision.py +0 -31
  14. replay/experimental/metrics/ndcg.py +0 -49
  15. replay/experimental/metrics/precision.py +0 -22
  16. replay/experimental/metrics/recall.py +0 -25
  17. replay/experimental/metrics/rocauc.py +0 -49
  18. replay/experimental/metrics/surprisal.py +0 -90
  19. replay/experimental/metrics/unexpectedness.py +0 -76
  20. replay/experimental/models/__init__.py +0 -50
  21. replay/experimental/models/admm_slim.py +0 -257
  22. replay/experimental/models/base_neighbour_rec.py +0 -200
  23. replay/experimental/models/base_rec.py +0 -1386
  24. replay/experimental/models/base_torch_rec.py +0 -234
  25. replay/experimental/models/cql.py +0 -454
  26. replay/experimental/models/ddpg.py +0 -932
  27. replay/experimental/models/dt4rec/__init__.py +0 -0
  28. replay/experimental/models/dt4rec/dt4rec.py +0 -189
  29. replay/experimental/models/dt4rec/gpt1.py +0 -401
  30. replay/experimental/models/dt4rec/trainer.py +0 -127
  31. replay/experimental/models/dt4rec/utils.py +0 -264
  32. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  33. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
  34. replay/experimental/models/hierarchical_recommender.py +0 -331
  35. replay/experimental/models/implicit_wrap.py +0 -131
  36. replay/experimental/models/lightfm_wrap.py +0 -303
  37. replay/experimental/models/mult_vae.py +0 -332
  38. replay/experimental/models/neural_ts.py +0 -986
  39. replay/experimental/models/neuromf.py +0 -406
  40. replay/experimental/models/scala_als.py +0 -293
  41. replay/experimental/models/u_lin_ucb.py +0 -115
  42. replay/experimental/nn/data/__init__.py +0 -1
  43. replay/experimental/nn/data/schema_builder.py +0 -102
  44. replay/experimental/preprocessing/__init__.py +0 -3
  45. replay/experimental/preprocessing/data_preparator.py +0 -839
  46. replay/experimental/preprocessing/padder.py +0 -229
  47. replay/experimental/preprocessing/sequence_generator.py +0 -208
  48. replay/experimental/scenarios/__init__.py +0 -1
  49. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  50. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
  51. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -261
  52. replay/experimental/scenarios/obp_wrapper/utils.py +0 -85
  53. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  54. replay/experimental/scenarios/two_stages/reranker.py +0 -117
  55. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
  56. replay/experimental/utils/__init__.py +0 -0
  57. replay/experimental/utils/logger.py +0 -24
  58. replay/experimental/utils/model_handler.py +0 -186
  59. replay/experimental/utils/session_handler.py +0 -44
  60. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.3.dist-info}/WHEEL +0 -0
  61. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.3.dist-info}/licenses/LICENSE +0 -0
  62. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.3.dist-info}/licenses/NOTICE +0 -0
replay/experimental/metrics/experiment.py
@@ -1,175 +0,0 @@
- from typing import Any, Optional
-
- from replay.utils import DataFrameLike, IntOrList, NumType, PandasDataFrame
- from replay.utils.spark_utils import convert2spark
-
- from .base_metric import Metric, NCISMetric, RecOnlyMetric, get_enriched_recommendations
-
-
- class Experiment:
-     """
-     This class calculates and stores metric values.
-     Initialize it with test data and a dictionary mapping metrics to their depth cut-offs.
-
-     Results are available with ``pandas_df`` attribute.
-
-     Example:
-
-     >>> import pandas as pd
-     >>> from replay.experimental.metrics import Coverage, NDCG, Precision, Surprisal
-     >>> log = pd.DataFrame({"user_idx": [2, 2, 2, 1], "item_idx": [1, 2, 3, 3], "relevance": [5, 5, 5, 5]})
-     >>> test = pd.DataFrame({"user_idx": [1, 1, 1], "item_idx": [1, 2, 3], "relevance": [5, 3, 4]})
-     >>> pred = pd.DataFrame({"user_idx": [1, 1, 1], "item_idx": [4, 1, 3], "relevance": [5, 4, 5]})
-     >>> recs = pd.DataFrame({"user_idx": [1, 1, 1], "item_idx": [1, 4, 5], "relevance": [5, 4, 5]})
-     >>> ex = Experiment(test, {NDCG(): [2, 3], Surprisal(log): 3})
-     >>> ex.add_result("baseline", recs)
-     >>> ex.add_result("baseline_gt_users", recs, ground_truth_users=pd.DataFrame({"user_idx": [1, 3]}))
-     >>> ex.add_result("model", pred)
-     >>> ex.results
-                          NDCG@2    NDCG@3  Surprisal@3
-     baseline           0.386853  0.296082     1.000000
-     baseline_gt_users  0.193426  0.148041     0.500000
-     model              0.386853  0.530721     0.666667
-     >>> ex.compare("baseline")
-                        NDCG@2  NDCG@3  Surprisal@3
-     baseline                –       –            –
-     baseline_gt_users  -50.0%  -50.0%       -50.0%
-     model                0.0%  79.25%      -33.33%
-     >>> ex = Experiment(test, {Precision(): [3]}, calc_median=True, calc_conf_interval=0.95)
-     >>> ex.add_result("baseline", recs)
-     >>> ex.add_result("model", pred)
-     >>> ex.results
-               Precision@3  Precision@3_median  Precision@3_0.95_conf_interval
-     baseline     0.333333            0.333333                             0.0
-     model        0.666667            0.666667                             0.0
-     >>> ex = Experiment(test, {Coverage(log): 3}, calc_median=True, calc_conf_interval=0.95)
-     >>> ex.add_result("baseline", recs)
-     >>> ex.add_result("model", pred)
-     >>> ex.results
-               Coverage@3  Coverage@3_median  Coverage@3_0.95_conf_interval
-     baseline         1.0                1.0                            0.0
-     model            1.0                1.0                            0.0
-     """
-
-     def __init__(
-         self,
-         test: Any,
-         metrics: dict[Metric, IntOrList],
-         calc_median: bool = False,
-         calc_conf_interval: Optional[float] = None,
-     ):
-         """
-         :param test: test DataFrame
-         :param metrics: Dictionary of metrics to calculate.
-             Key -- metric, value -- ``int`` or a list of ints.
-         :param calc_median: flag to calculate median value across users
-         :param calc_conf_interval: quantile value for the calculation of the confidence interval.
-             Resulting value is the half of confidence interval.
-         """
-         self.test = convert2spark(test)
-         self.results = PandasDataFrame()
-         self.metrics = metrics
-         self.calc_median = calc_median
-         self.calc_conf_interval = calc_conf_interval
-
-     def add_result(
-         self,
-         name: str,
-         pred: DataFrameLike,
-         ground_truth_users: Optional[DataFrameLike] = None,
-     ) -> None:
-         """
-         Calculate metrics for predictions
-
-         :param name: name of the run to store in the resulting DataFrame
-         :param pred: model recommendations
-         :param ground_truth_users: list of users to consider in metric calculation.
-             if None, only the users from ground_truth are considered.
-         """
-         max_k = 0
-         for current_k in self.metrics.values():
-             max_k = max((*current_k, max_k) if isinstance(current_k, list) else (current_k, max_k))
-
-         recs = get_enriched_recommendations(pred, self.test, max_k, ground_truth_users).cache()
-         for metric, k_list in sorted(self.metrics.items(), key=lambda x: str(x[0])):
-             enriched = None
-             if isinstance(metric, (RecOnlyMetric, NCISMetric)):
-                 enriched = metric._get_enriched_recommendations(pred, self.test, max_k, ground_truth_users)
-             values, median, conf_interval = self._calculate(metric, enriched or recs, k_list)
-
-             if isinstance(k_list, int):
-                 self._add_metric(
-                     name,
-                     metric,
-                     k_list,
-                     values,
-                     median,
-                     conf_interval,
-                 )
-             else:
-                 for k, val in sorted(values.items(), key=lambda x: x[0]):
-                     self._add_metric(
-                         name,
-                         metric,
-                         k,
-                         val,
-                         None if median is None else median[k],
-                         None if conf_interval is None else conf_interval[k],
-                     )
-         recs.unpersist()
-
-     def _calculate(self, metric, enriched, k_list):
-         median = None
-         conf_interval = None
-         values = metric._mean(enriched, k_list)
-         if self.calc_median:
-             median = metric._median(enriched, k_list)
-         if self.calc_conf_interval is not None:
-             conf_interval = metric._conf_interval(enriched, k_list, self.calc_conf_interval)
-         return values, median, conf_interval
-
-     def _add_metric(
-         self,
-         name: str,
-         metric: Metric,
-         k: int,
-         value: NumType,
-         median: Optional[NumType],
-         conf_interval: Optional[NumType],
-     ):
-         """
-         Add metric for a specific k
-
-         :param name: name to save results
-         :param metric: metric object
-         :param k: length of the recommendation list
-         :param value: metric value
-         :param median: median value
-         :param conf_interval: confidence interval value
-         """
-         self.results.at[name, f"{metric}@{k}"] = value
-         if median is not None:
-             self.results.at[name, f"{metric}@{k}_median"] = median
-         if conf_interval is not None:
-             self.results.at[name, f"{metric}@{k}_{self.calc_conf_interval}_conf_interval"] = conf_interval
-
-     def compare(self, name: str) -> PandasDataFrame:
-         """
-         Show results as a percentage difference to record ``name``.
-
-         :param name: name of the baseline record
-         :return: results table in a percentage format
-         """
-         if name not in self.results.index:
-             msg = f"No results for model {name}"
-             raise ValueError(msg)
-         columns = [column for column in self.results.columns if column[-1].isdigit()]
-         data_frame = self.results[columns].copy()
-         baseline = data_frame.loc[name]
-         for idx in data_frame.index:
-             if idx != name:
-                 diff = data_frame.loc[idx] / baseline - 1
-                 data_frame.loc[idx] = [str(round(v * 100, 2)) + "%" for v in diff]
-             else:
-                 data_frame.loc[name] = ["–"] * len(baseline)
-         return data_frame
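For reference, the relative values printed by the removed ``compare()`` doctest above follow directly from the absolute metrics. A minimal standalone sketch (plain pandas, independent of the removed class) that reproduces the NDCG@3 column:

import pandas as pd

# Absolute metric values taken from the Experiment doctest above.
results = pd.DataFrame(
    {"NDCG@3": [0.296082, 0.148041, 0.530721]},
    index=["baseline", "baseline_gt_users", "model"],
)

baseline = results.loc["baseline"]
# Same formula as the removed compare(): value / baseline - 1, shown as a percentage.
diff = (results / baseline - 1) * 100
print(diff.round(2))  # baseline 0.0, baseline_gt_users -50.0, model 79.25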
replay/experimental/metrics/hitrate.py
@@ -1,26 +0,0 @@
- from .base_metric import Metric
-
-
- class HitRate(Metric):
-     """
-     Percentage of users that have at least one
-     correctly recommended item among top-k.
-
-     .. math::
-         HitRate@K(i) = \\max_{j \\in [1..K]}\\mathbb{1}_{r_{ij}}
-
-     .. math::
-         HitRate@K = \\frac {\\sum_{i=1}^{N}HitRate@K(i)}{N}
-
-     :math:`\\mathbb{1}_{r_{ij}}` -- indicator function stating that user :math:`i` interacted with item :math:`j`
-
-     """
-
-     _scala_udf_name = "getHitRateMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         for i in pred[:k]:
-             if i in ground_truth:
-                 return 1
-         return 0
replay/experimental/metrics/map.py
@@ -1,30 +0,0 @@
- from .base_metric import Metric
-
-
- class MAP(Metric):
-     """
-     Mean Average Precision -- average the ``Precision`` at relevant positions for each user,
-     and then calculate the mean across all users.
-
-     .. math::
-         &AP@K(i) = \\frac 1K \\sum_{j=1}^{K}\\mathbb{1}_{r_{ij}}Precision@j(i)
-
-         &MAP@K = \\frac {\\sum_{i=1}^{N}AP@K(i)}{N}
-
-     :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing if user :math:`i` interacted with item :math:`j`
-     """
-
-     _scala_udf_name = "getMAPMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         length = min(k, len(pred))
-         if len(ground_truth) == 0 or len(pred) == 0:
-             return 0
-         tp_cum = 0
-         result = 0
-         for i in range(length):
-             if pred[i] in ground_truth:
-                 tp_cum += 1
-                 result += tp_cum / (i + 1)
-         return result / k
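As a quick sanity check of the formula above, the removed implementation can be exercised on a tiny hand-worked case; a minimal sketch with made-up inputs (note that, as in the removed code, the sum is divided by ``k`` rather than by the number of hits):

# Hand-worked AP@3: hits at ranks 1 and 3 give (1/1 + 2/3) / 3 ≈ 0.5556.
pred = [1, 4, 3]          # recommended items, best first (hypothetical)
ground_truth = {1, 2, 3}  # items the user actually interacted with (hypothetical)
k = 3

tp_cum, result = 0, 0.0
for i in range(min(k, len(pred))):
    if pred[i] in ground_truth:
        tp_cum += 1
        result += tp_cum / (i + 1)  # Precision@(i+1) at each hit position
print(result / k)  # 0.5555...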
replay/experimental/metrics/mrr.py
@@ -1,18 +0,0 @@
- from .base_metric import Metric
-
-
- class MRR(Metric):
-     """
-     Mean Reciprocal Rank --
-     Reciprocal Rank is the inverse position of the first relevant item among top-k recommendations,
-     :math:`\\frac {1}{rank_i}`. This value is averaged by all users.
-     """
-
-     _scala_udf_name = "getMRRMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         for i in range(min(k, len(pred))):
-             if pred[i] in ground_truth:
-                 return 1 / (1 + i)
-         return 0
replay/experimental/metrics/ncis_precision.py
@@ -1,31 +0,0 @@
- import numpy as np
-
- from .base_metric import NCISMetric
-
-
- class NCISPrecision(NCISMetric):
-     """
-     Share of relevant items among top ``K`` recommendations with NCIS weighting.
-
-     .. math::
-         Precision@K(i) = \\frac {\\sum_{j=1}^{K}\\mathbb{1}_{r_{ij} w_{ij}}}{\\sum_{j=1}^{K} w_{ij}}
-
-     .. math::
-         Precision@K = \\frac {\\sum_{i=1}^{N}Precision@K(i)}{N}
-
-     :math:`\\mathbb{1}_{r_{ij}}` -- indicator function
-     showing that user :math:`i` interacted with item :math:`j`
-     :math:`w_{ij}` -- NCIS weight, calculated as ratio of current policy score on previous
-     policy score with clipping and optional activation over policy scores (relevance).
-     Source: arxiv.org/abs/1801.07030
-     """
-
-     _scala_udf_name = "getNCISPrecisionMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, *args):
-         pred, ground_truth, pred_weights = args
-         if len(pred) == 0 or len(ground_truth) == 0:
-             return 0
-         mask = np.isin(pred[:k], ground_truth)
-         return sum(np.array(pred_weights)[mask]) / sum(pred_weights[:k])
replay/experimental/metrics/ndcg.py
@@ -1,49 +0,0 @@
- import math
-
- from .base_metric import Metric
-
-
- class NDCG(Metric):
-     """
-     Normalized Discounted Cumulative Gain is a metric
-     that takes into account positions of relevant items.
-
-     This is the binary version, it takes into account
-     whether the item was consumed or not, relevance value is ignored.
-
-     .. math::
-         DCG@K(i) = \\sum_{j=1}^{K}\\frac{\\mathbb{1}_{r_{ij}}}{\\log_2 (j+1)}
-
-
-     :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing that user :math:`i` interacted with item :math:`j`
-
-     To get from :math:`DCG` to :math:`nDCG` we calculate the biggest possible value of `DCG`
-     for user :math:`i` and recommendation length :math:`K`.
-
-     .. math::
-         IDCG@K(i) = max(DCG@K(i)) = \\sum_{j=1}^{K}\\frac{\\mathbb{1}_{j\\le|Rel_i|}}{\\log_2 (j+1)}
-
-     .. math::
-         nDCG@K(i) = \\frac {DCG@K(i)}{IDCG@K(i)}
-
-     :math:`|Rel_i|` -- number of relevant items for user :math:`i`
-
-     Metric is averaged by users.
-
-     .. math::
-         nDCG@K = \\frac {\\sum_{i=1}^{N}nDCG@K(i)}{N}
-     """
-
-     _scala_udf_name = "getNDCGMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         if len(pred) == 0 or len(ground_truth) == 0:
-             return 0.0
-         pred_len = min(k, len(pred))
-         ground_truth_len = min(k, len(ground_truth))
-         denom = [1 / math.log2(i + 2) for i in range(k)]
-         dcg = sum(denom[i] for i in range(pred_len) if pred[i] in ground_truth)
-         idcg = sum(denom[:ground_truth_len])
-
-         return dcg / idcg
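The binary nDCG above can be checked against the numbers in the ``Experiment`` doctest earlier in this diff: for user 1 with ground truth {1, 2, 3}, the value 0.386853 for NDCG@2 implies the tied item 4 was ranked ahead of item 3. A minimal sketch repeating the removed computation under that assumption:

import math

# Inputs mirror the Experiment doctest; the ordering [4, 3, 1] is implied by the
# doctest value, not stated explicitly there.
pred = [4, 3, 1]
ground_truth = {1, 2, 3}
k = 2

denom = [1 / math.log2(i + 2) for i in range(k)]
dcg = sum(denom[i] for i in range(min(k, len(pred))) if pred[i] in ground_truth)
idcg = sum(denom[: min(k, len(ground_truth))])
print(dcg / idcg)  # ≈ 0.386853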
replay/experimental/metrics/precision.py
@@ -1,22 +0,0 @@
- from .base_metric import Metric
-
-
- class Precision(Metric):
-     """
-     Mean percentage of relevant items among top ``K`` recommendations.
-
-     .. math::
-         Precision@K(i) = \\frac {\\sum_{j=1}^{K}\\mathbb{1}_{r_{ij}}}{K}
-
-     .. math::
-         Precision@K = \\frac {\\sum_{i=1}^{N}Precision@K(i)}{N}
-
-     :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing that user :math:`i` interacted with item :math:`j`"""
-
-     _scala_udf_name = "getPrecisionMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         if len(pred) == 0:
-             return 0
-         return len(set(pred[:k]) & set(ground_truth)) / k
replay/experimental/metrics/recall.py
@@ -1,25 +0,0 @@
- from .base_metric import Metric
-
-
- class Recall(Metric):
-     """
-     Mean percentage of relevant items, that was shown among top ``K`` recommendations.
-
-     .. math::
-         Recall@K(i) = \\frac {\\sum_{j=1}^{K}\\mathbb{1}_{r_{ij}}}{|Rel_i|}
-
-     .. math::
-         Recall@K = \\frac {\\sum_{i=1}^{N}Recall@K(i)}{N}
-
-     :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing that user :math:`i` interacted with item :math:`j`
-
-     :math:`|Rel_i|` -- the number of relevant items for user :math:`i`
-     """
-
-     _scala_udf_name = "getRecallMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         if len(ground_truth) == 0:
-             return 0.0
-         return len(set(pred[:k]) & set(ground_truth)) / len(ground_truth)
replay/experimental/metrics/rocauc.py
@@ -1,49 +0,0 @@
- from .base_metric import Metric
-
-
- class RocAuc(Metric):
-     """
-     Receiver Operating Characteristic/Area Under the Curve is the aggregated performance measure,
-     that depends only on the order of recommended items.
-     It can be interpreted as the fraction of object pairs (object of class 1, object of class 0)
-     that were correctly ordered by the model.
-     The bigger the value of AUC, the better the classification model.
-
-     .. math::
-         ROCAUC@K(i) = \\frac {\\sum_{s=1}^{K}\\sum_{t=1}^{K}
-         \\mathbb{1}_{r_{si}<r_{ti}}
-         \\mathbb{1}_{gt_{si}<gt_{ti}}}
-         {\\sum_{s=1}^{K}\\sum_{t=1}^{K} \\mathbb{1}_{gt_{si}<gt_{tj}}}
-
-     :math:`\\mathbb{1}_{r_{si}<r_{ti}}` -- indicator function showing that recommendation score for
-     user :math:`i` for item :math:`s` is bigger than for item :math:`t`
-
-     :math:`\\mathbb{1}_{gt_{si}<gt_{ti}}` -- indicator function showing that
-     user :math:`i` values item :math:`s` more than item :math:`t`.
-
-     Metric is averaged by all users.
-
-     .. math::
-         ROCAUC@K = \\frac {\\sum_{i=1}^{N}ROCAUC@K(i)}{N}
-     """
-
-     _scala_udf_name = "getRocAucMetricValue"
-
-     @staticmethod
-     def _get_metric_value_by_user(k, pred, ground_truth) -> float:
-         length = min(k, len(pred))
-         if len(ground_truth) == 0 or len(pred) == 0:
-             return 0
-
-         fp_cur = 0
-         fp_cum = 0
-         for item in pred[:length]:
-             if item in ground_truth:
-                 fp_cum += fp_cur
-             else:
-                 fp_cur += 1
-         if fp_cur == length:
-             return 0
-         if fp_cum == 0:
-             return 1
-         return 1 - fp_cum / (fp_cur * (length - fp_cur))
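The closed form returned on the last line is the usual pairwise AUC: ``fp_cum`` counts (relevant, non-relevant) pairs ordered incorrectly, and ``fp_cur * (length - fp_cur)`` is the total number of such pairs. A minimal sketch with made-up inputs showing the equivalence:

pred = [1, 4, 3]          # recommended items, best first (hypothetical)
ground_truth = {1, 2, 3}
k = 3

# Removed-code style: for every relevant item, accumulate how many
# non-relevant items are ranked above it.
fp_cur = fp_cum = 0
for item in pred[:k]:
    if item in ground_truth:
        fp_cum += fp_cur
    else:
        fp_cur += 1
auc = 1 - fp_cum / (fp_cur * (k - fp_cur))  # 1 - 1 / (1 * 2) = 0.5

# Direct pairwise count: fraction of (relevant, non-relevant) pairs ranked correctly.
pairs = [(r, n) for r in pred if r in ground_truth for n in pred if n not in ground_truth]
correct = sum(pred.index(r) < pred.index(n) for r, n in pairs)
print(auc, correct / len(pairs))  # 0.5 0.5

The early returns in the removed code cover the degenerate cases (no relevant items in the top-k, or no misordered pairs) where this ratio is undefined or trivially 1.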
replay/experimental/metrics/surprisal.py
@@ -1,90 +0,0 @@
- from typing import Optional
-
- import numpy as np
-
- from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, SparkDataFrame
- from replay.utils.spark_utils import convert2spark, get_top_k_recs
-
- from .base_metric import RecOnlyMetric, fill_na_with_empty_array, filter_sort
-
- if PYSPARK_AVAILABLE:
-     from pyspark.sql import (
-         functions as sf,
-         types as st,
-     )
-
-
- class Surprisal(RecOnlyMetric):
-     """
-     Measures how many surprising rare items are present in recommendations.
-
-     .. math::
-         \\textit{Self-Information}(j)= -\\log_2 \\frac {u_j}{N}
-
-     :math:`u_j` -- number of users that interacted with item :math:`j`.
-     Cold items are treated as if they were rated by 1 user.
-     That is, if they appear in recommendations it will be completely unexpected.
-
-     Metric is normalized.
-
-     Surprisal for item :math:`j` is
-
-     .. math::
-         Surprisal(j)= \\frac {\\textit{Self-Information}(j)}{log_2 N}
-
-     Recommendation list surprisal is the average surprisal of items in it.
-
-     .. math::
-         Surprisal@K(i) = \\frac {\\sum_{j=1}^{K}Surprisal(j)} {K}
-
-     Final metric is averaged by users.
-
-     .. math::
-         Surprisal@K = \\frac {\\sum_{i=1}^{N}Surprisal@K(i)}{N}
-     """
-
-     _scala_udf_name = "getSurprisalMetricValue"
-
-     def __init__(self, log: DataFrameLike, use_scala_udf: bool = False):
-         """
-         Here we calculate self-information for each item
-
-         :param log: historical data
-         """
-         self._use_scala_udf = use_scala_udf
-         self.log = convert2spark(log)
-         n_users = self.log.select("user_idx").distinct().count()
-         self.item_weights = self.log.groupby("item_idx").agg(
-             (sf.log2(n_users / sf.countDistinct("user_idx")) / np.log2(n_users)).alias("rec_weight")
-         )
-
-     @staticmethod
-     def _get_metric_value_by_user(k, *args):
-         weigths = args[0]
-         return sum(weigths[:k]) / k
-
-     def _get_enriched_recommendations(
-         self,
-         recommendations: SparkDataFrame,
-         ground_truth: SparkDataFrame,  # noqa: ARG002
-         max_k: int,
-         ground_truth_users: Optional[DataFrameLike] = None,
-     ) -> SparkDataFrame:
-         recommendations = convert2spark(recommendations)
-         ground_truth_users = convert2spark(ground_truth_users)
-         recommendations = get_top_k_recs(recommendations, max_k)
-
-         recommendations = recommendations.join(self.item_weights, on="item_idx", how="left").fillna(1.0)
-         recommendations = filter_sort(recommendations, "rec_weight")
-         recommendations = recommendations.select("user_idx", sf.col("rec_weight")).withColumn(
-             "rec_weight",
-             sf.col("rec_weight").cast(st.ArrayType(st.DoubleType(), True)),
-         )
-         if ground_truth_users is not None:
-             recommendations = fill_na_with_empty_array(
-                 recommendations.join(ground_truth_users, on="user_idx", how="right"),
-                 "rec_weight",
-                 self.item_weights.schema["rec_weight"].dataType,
-             )
-
-         return recommendations
replay/experimental/metrics/unexpectedness.py
@@ -1,76 +0,0 @@
- from typing import Optional
-
- from replay.utils import DataFrameLike, SparkDataFrame
- from replay.utils.spark_utils import convert2spark, get_top_k_recs
-
- from .base_metric import RecOnlyMetric, fill_na_with_empty_array, filter_sort
-
-
- class Unexpectedness(RecOnlyMetric):
-     """
-     Fraction of recommended items that are not present in some baseline recommendations.
-
-     >>> import pandas as pd
-     >>> from replay.utils.session_handler import get_spark_session, State
-     >>> spark = get_spark_session(1, 1)
-     >>> state = State(spark)
-
-     >>> log = pd.DataFrame({
-     ...     "user_idx": [1, 1, 1],
-     ...     "item_idx": [1, 2, 3],
-     ...     "relevance": [5, 5, 5],
-     ...     "timestamp": [1, 1, 1],
-     ... })
-     >>> recs = pd.DataFrame({
-     ...     "user_idx": [1, 1, 1],
-     ...     "item_idx": [0, 0, 1],
-     ...     "relevance": [5, 5, 5],
-     ...     "timestamp": [1, 1, 1],
-     ... })
-     >>> metric = Unexpectedness(log)
-     >>> round(metric(recs, 3), 2)
-     0.67
-     """
-
-     _scala_udf_name = "getUnexpectednessMetricValue"
-
-     def __init__(self, pred: DataFrameLike, use_scala_udf: bool = False):
-         """
-         :param pred: model predictions
-         """
-         self._use_scala_udf = use_scala_udf
-         self.pred = convert2spark(pred)
-
-     @staticmethod
-     def _get_metric_value_by_user(k, *args) -> float:
-         pred = args[0]
-         base_pred = args[1]
-         if len(pred) == 0:
-             return 0
-         return 1.0 - len(set(pred[:k]) & set(base_pred[:k])) / k
-
-     def _get_enriched_recommendations(
-         self,
-         recommendations: SparkDataFrame,
-         ground_truth: SparkDataFrame,  # noqa: ARG002
-         max_k: int,
-         ground_truth_users: Optional[DataFrameLike] = None,
-     ) -> SparkDataFrame:
-         recommendations = convert2spark(recommendations)
-         ground_truth_users = convert2spark(ground_truth_users)
-         base_pred = self.pred
-
-         # TO DO: preprocess base_recs once in __init__
-
-         base_recs = filter_sort(base_pred).withColumnRenamed("pred", "base_pred")
-
-         # if there are duplicates in recommendations,
-         # we will leave fewer than k recommendations after sort_udf
-         recommendations = get_top_k_recs(recommendations, k=max_k)
-         recommendations = filter_sort(recommendations)
-         recommendations = recommendations.join(base_recs, how="right", on=["user_idx"])
-
-         if ground_truth_users is not None:
-             recommendations = recommendations.join(ground_truth_users, on="user_idx", how="right")
-
-         return fill_na_with_empty_array(recommendations, "pred", base_pred.schema["item_idx"].dataType)
replay/experimental/models/__init__.py
@@ -1,50 +0,0 @@
- from typing import Any
-
- from replay.experimental.models.admm_slim import ADMMSLIM
- from replay.experimental.models.base_torch_rec import TorchRecommender
- from replay.experimental.models.cql import CQL
- from replay.experimental.models.ddpg import DDPG
- from replay.experimental.models.dt4rec.dt4rec import DT4Rec
- from replay.experimental.models.hierarchical_recommender import HierarchicalRecommender
- from replay.experimental.models.implicit_wrap import ImplicitWrap
- from replay.experimental.models.mult_vae import MultVAE
- from replay.experimental.models.neural_ts import NeuralTS
- from replay.experimental.models.neuromf import NeuroMF
- from replay.experimental.models.scala_als import ScalaALSWrap
- from replay.experimental.models.u_lin_ucb import ULinUCB
-
- __all__ = [
-     "ADMMSLIM",
-     "CQL",
-     "DDPG",
-     "DT4Rec",
-     "HierarchicalRecommender",
-     "ImplicitWrap",
-     "MultVAE",
-     "NeuralTS",
-     "NeuroMF",
-     "ScalaALSWrap",
-     "TorchRecommender",
-     "ULinUCB",
- ]
-
- CONDITIONAL_IMPORTS = {"LightFMWrap": "replay.experimental.models.lightfm_wrap"}
-
-
- class ConditionalAccessError(Exception):
-     """Raised when trying to access conditional elements from parent module instead of a direct import."""
-
-
- def __getattr__(name: str) -> Any:
-     if name in CONDITIONAL_IMPORTS:
-         msg = (
-             f"{name} relies on manual dependency installation and cannot be accessed via higher-level modules. "
-             f"If you wish to use this attribute, import it directly from {CONDITIONAL_IMPORTS[name]}"
-         )
-
-         raise ConditionalAccessError(msg)
-
-     if name in __all__:
-         return globals()[name]
-     msg = f"module {__name__!r} has no attribute {name!r}"
-     raise AttributeError(msg)