replay-rec 0.20.2__py3-none-any.whl → 0.20.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/nn/sequential_dataset.py +8 -2
- replay/experimental/__init__.py +0 -0
- replay/experimental/metrics/__init__.py +62 -0
- replay/experimental/metrics/base_metric.py +603 -0
- replay/experimental/metrics/coverage.py +97 -0
- replay/experimental/metrics/experiment.py +175 -0
- replay/experimental/metrics/hitrate.py +26 -0
- replay/experimental/metrics/map.py +30 -0
- replay/experimental/metrics/mrr.py +18 -0
- replay/experimental/metrics/ncis_precision.py +31 -0
- replay/experimental/metrics/ndcg.py +49 -0
- replay/experimental/metrics/precision.py +22 -0
- replay/experimental/metrics/recall.py +25 -0
- replay/experimental/metrics/rocauc.py +49 -0
- replay/experimental/metrics/surprisal.py +90 -0
- replay/experimental/metrics/unexpectedness.py +76 -0
- replay/experimental/models/__init__.py +50 -0
- replay/experimental/models/admm_slim.py +257 -0
- replay/experimental/models/base_neighbour_rec.py +200 -0
- replay/experimental/models/base_rec.py +1386 -0
- replay/experimental/models/base_torch_rec.py +234 -0
- replay/experimental/models/cql.py +454 -0
- replay/experimental/models/ddpg.py +932 -0
- replay/experimental/models/dt4rec/__init__.py +0 -0
- replay/experimental/models/dt4rec/dt4rec.py +189 -0
- replay/experimental/models/dt4rec/gpt1.py +401 -0
- replay/experimental/models/dt4rec/trainer.py +127 -0
- replay/experimental/models/dt4rec/utils.py +264 -0
- replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
- replay/experimental/models/extensions/spark_custom_models/als_extension.py +792 -0
- replay/experimental/models/hierarchical_recommender.py +331 -0
- replay/experimental/models/implicit_wrap.py +131 -0
- replay/experimental/models/lightfm_wrap.py +303 -0
- replay/experimental/models/mult_vae.py +332 -0
- replay/experimental/models/neural_ts.py +986 -0
- replay/experimental/models/neuromf.py +406 -0
- replay/experimental/models/scala_als.py +293 -0
- replay/experimental/models/u_lin_ucb.py +115 -0
- replay/experimental/nn/data/__init__.py +1 -0
- replay/experimental/nn/data/schema_builder.py +102 -0
- replay/experimental/preprocessing/__init__.py +3 -0
- replay/experimental/preprocessing/data_preparator.py +839 -0
- replay/experimental/preprocessing/padder.py +229 -0
- replay/experimental/preprocessing/sequence_generator.py +208 -0
- replay/experimental/scenarios/__init__.py +1 -0
- replay/experimental/scenarios/obp_wrapper/__init__.py +8 -0
- replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +74 -0
- replay/experimental/scenarios/obp_wrapper/replay_offline.py +261 -0
- replay/experimental/scenarios/obp_wrapper/utils.py +85 -0
- replay/experimental/scenarios/two_stages/__init__.py +0 -0
- replay/experimental/scenarios/two_stages/reranker.py +117 -0
- replay/experimental/scenarios/two_stages/two_stages_scenario.py +757 -0
- replay/experimental/utils/__init__.py +0 -0
- replay/experimental/utils/logger.py +24 -0
- replay/experimental/utils/model_handler.py +186 -0
- replay/experimental/utils/session_handler.py +44 -0
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/METADATA +11 -17
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/RECORD +62 -7
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/WHEEL +0 -0
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/licenses/LICENSE +0 -0
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/licenses/NOTICE +0 -0

replay/experimental/metrics/coverage.py
@@ -0,0 +1,97 @@
from typing import Optional, Union

from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, IntOrList, NumType, SparkDataFrame
from replay.utils.spark_utils import convert2spark

from .base_metric import RecOnlyMetric, process_k

if PYSPARK_AVAILABLE:
    from pyspark.sql import (
        Window,
        functions as sf,
    )


class Coverage(RecOnlyMetric):
    """
    Metric calculation is as follows:

    * take ``K`` recommendations with the biggest ``relevance`` for each ``user_id``
    * count the number of distinct ``item_id`` in these recommendations
    * divide it by the number of items in the whole data set

    """

    def __init__(self, log: DataFrameLike):
        """
        :param log: pandas or Spark DataFrame
            It is important for ``log`` to contain all available items.
        """
        self.items = convert2spark(log).select("item_idx").distinct()
        self.item_count = self.items.count()

    @staticmethod
    def _get_metric_value_by_user(k, *args):
        # not averaged by users
        pass

    def _get_enriched_recommendations(
        self,
        recommendations: DataFrameLike,
        ground_truth: DataFrameLike,  # noqa: ARG002
        max_k: int,  # noqa: ARG002
        ground_truth_users: Optional[DataFrameLike] = None,
    ) -> SparkDataFrame:
        recommendations = convert2spark(recommendations)
        if ground_truth_users is not None:
            ground_truth_users = convert2spark(ground_truth_users)
            return recommendations.join(ground_truth_users, on="user_idx", how="inner")
        return recommendations

    def _conf_interval(
        self,
        recs: DataFrameLike,  # noqa: ARG002
        k_list: IntOrList,
        alpha: float = 0.95,  # noqa: ARG002
    ) -> Union[dict[int, float], float]:
        if isinstance(k_list, int):
            return 0.0
        return dict.fromkeys(k_list, 0.0)

    def _median(
        self,
        recs: DataFrameLike,
        k_list: IntOrList,
    ) -> Union[dict[int, NumType], NumType]:
        return self._mean(recs, k_list)

    @process_k
    def _mean(
        self,
        recs: SparkDataFrame,
        k_list: list,
    ) -> Union[dict[int, NumType], NumType]:
        unknown_item_count = recs.select("item_idx").distinct().exceptAll(self.items).count()
        if unknown_item_count > 0:
            self.logger.warning(
                "Recommendations contain items that were not present in the log. "
                r"The resulting metric value can be more than 1.0 ¯\_(ツ)_/¯"
            )

        best_positions = (
            recs.withColumn(
                "row_num",
                sf.row_number().over(Window.partitionBy("user_idx").orderBy(sf.desc("relevance"))),
            )
            .select("item_idx", "row_num")
            .groupBy("item_idx")
            .agg(sf.min("row_num").alias("best_position"))
            .cache()
        )

        res = {}
        for current_k in k_list:
            res[current_k] = best_positions.filter(sf.col("best_position") <= current_k).count() / self.item_count

        best_positions.unpersist()
        return res
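The Spark pipeline in ``_mean`` reduces to a simple idea: record the best (smallest) rank each item ever reaches in any user's list and count how many items make it into the top ``K``, divided by the catalogue size taken from ``log``. Below is a minimal pandas sketch of that computation for illustration only; the toy frames and column handling are our own assumptions, not part of the package:

import pandas as pd

# toy data: a catalogue of 4 items and recommendations for 2 users
log = pd.DataFrame({"user_idx": [1, 1, 2, 2], "item_idx": [1, 2, 3, 4]})
recs = pd.DataFrame(
    {
        "user_idx": [1, 1, 2, 2],
        "item_idx": [1, 2, 3, 1],
        "relevance": [0.9, 0.8, 0.7, 0.6],
    }
)

item_count = log["item_idx"].nunique()

# rank recommendations per user by descending relevance
recs = recs.sort_values("relevance", ascending=False)
recs["row_num"] = recs.groupby("user_idx").cumcount() + 1

# best (smallest) rank each item ever gets across users
best_position = recs.groupby("item_idx")["row_num"].min()

k = 1
print((best_position <= k).sum() / item_count)  # 0.5 -- items 1 and 3 each top someone's list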

replay/experimental/metrics/experiment.py
@@ -0,0 +1,175 @@
from typing import Any, Optional

from replay.utils import DataFrameLike, IntOrList, NumType, PandasDataFrame
from replay.utils.spark_utils import convert2spark

from .base_metric import Metric, NCISMetric, RecOnlyMetric, get_enriched_recommendations


class Experiment:
    """
    This class calculates and stores metric values.
    Initialize it with test data and a dictionary mapping metrics to their depth cut-offs.

    Results are available with ``pandas_df`` attribute.

    Example:

    >>> import pandas as pd
    >>> from replay.experimental.metrics import Coverage, NDCG, Precision, Surprisal
    >>> log = pd.DataFrame({"user_idx": [2, 2, 2, 1], "item_idx": [1, 2, 3, 3], "relevance": [5, 5, 5, 5]})
    >>> test = pd.DataFrame({"user_idx": [1, 1, 1], "item_idx": [1, 2, 3], "relevance": [5, 3, 4]})
    >>> pred = pd.DataFrame({"user_idx": [1, 1, 1], "item_idx": [4, 1, 3], "relevance": [5, 4, 5]})
    >>> recs = pd.DataFrame({"user_idx": [1, 1, 1], "item_idx": [1, 4, 5], "relevance": [5, 4, 5]})
    >>> ex = Experiment(test, {NDCG(): [2, 3], Surprisal(log): 3})
    >>> ex.add_result("baseline", recs)
    >>> ex.add_result("baseline_gt_users", recs, ground_truth_users=pd.DataFrame({"user_idx": [1, 3]}))
    >>> ex.add_result("model", pred)
    >>> ex.results
                         NDCG@2    NDCG@3  Surprisal@3
    baseline           0.386853  0.296082     1.000000
    baseline_gt_users  0.193426  0.148041     0.500000
    model              0.386853  0.530721     0.666667
    >>> ex.compare("baseline")
                       NDCG@2  NDCG@3 Surprisal@3
    baseline                –       –           –
    baseline_gt_users  -50.0%  -50.0%      -50.0%
    model                0.0%  79.25%     -33.33%
    >>> ex = Experiment(test, {Precision(): [3]}, calc_median=True, calc_conf_interval=0.95)
    >>> ex.add_result("baseline", recs)
    >>> ex.add_result("model", pred)
    >>> ex.results
              Precision@3  Precision@3_median  Precision@3_0.95_conf_interval
    baseline     0.333333            0.333333                             0.0
    model        0.666667            0.666667                             0.0
    >>> ex = Experiment(test, {Coverage(log): 3}, calc_median=True, calc_conf_interval=0.95)
    >>> ex.add_result("baseline", recs)
    >>> ex.add_result("model", pred)
    >>> ex.results
              Coverage@3  Coverage@3_median  Coverage@3_0.95_conf_interval
    baseline         1.0                1.0                            0.0
    model            1.0                1.0                            0.0
    """

    def __init__(
        self,
        test: Any,
        metrics: dict[Metric, IntOrList],
        calc_median: bool = False,
        calc_conf_interval: Optional[float] = None,
    ):
        """
        :param test: test DataFrame
        :param metrics: Dictionary of metrics to calculate.
            Key -- metric, value -- ``int`` or a list of ints.
        :param calc_median: flag to calculate median value across users
        :param calc_conf_interval: quantile value for the calculation of the confidence interval.
            Resulting value is the half of confidence interval.
        """
        self.test = convert2spark(test)
        self.results = PandasDataFrame()
        self.metrics = metrics
        self.calc_median = calc_median
        self.calc_conf_interval = calc_conf_interval

    def add_result(
        self,
        name: str,
        pred: DataFrameLike,
        ground_truth_users: Optional[DataFrameLike] = None,
    ) -> None:
        """
        Calculate metrics for predictions

        :param name: name of the run to store in the resulting DataFrame
        :param pred: model recommendations
        :param ground_truth_users: list of users to consider in metric calculation.
            if None, only the users from ground_truth are considered.
        """
        max_k = 0
        for current_k in self.metrics.values():
            max_k = max((*current_k, max_k) if isinstance(current_k, list) else (current_k, max_k))

        recs = get_enriched_recommendations(pred, self.test, max_k, ground_truth_users).cache()
        for metric, k_list in sorted(self.metrics.items(), key=lambda x: str(x[0])):
            enriched = None
            if isinstance(metric, (RecOnlyMetric, NCISMetric)):
                enriched = metric._get_enriched_recommendations(pred, self.test, max_k, ground_truth_users)
            values, median, conf_interval = self._calculate(metric, enriched or recs, k_list)

            if isinstance(k_list, int):
                self._add_metric(
                    name,
                    metric,
                    k_list,
                    values,
                    median,
                    conf_interval,
                )
            else:
                for k, val in sorted(values.items(), key=lambda x: x[0]):
                    self._add_metric(
                        name,
                        metric,
                        k,
                        val,
                        None if median is None else median[k],
                        None if conf_interval is None else conf_interval[k],
                    )
        recs.unpersist()

    def _calculate(self, metric, enriched, k_list):
        median = None
        conf_interval = None
        values = metric._mean(enriched, k_list)
        if self.calc_median:
            median = metric._median(enriched, k_list)
        if self.calc_conf_interval is not None:
            conf_interval = metric._conf_interval(enriched, k_list, self.calc_conf_interval)
        return values, median, conf_interval

    def _add_metric(
        self,
        name: str,
        metric: Metric,
        k: int,
        value: NumType,
        median: Optional[NumType],
        conf_interval: Optional[NumType],
    ):
        """
        Add metric for a specific k

        :param name: name to save results
        :param metric: metric object
        :param k: length of the recommendation list
        :param value: metric value
        :param median: median value
        :param conf_interval: confidence interval value
        """
        self.results.at[name, f"{metric}@{k}"] = value
        if median is not None:
            self.results.at[name, f"{metric}@{k}_median"] = median
        if conf_interval is not None:
            self.results.at[name, f"{metric}@{k}_{self.calc_conf_interval}_conf_interval"] = conf_interval

    def compare(self, name: str) -> PandasDataFrame:
        """
        Show results as a percentage difference to record ``name``.

        :param name: name of the baseline record
        :return: results table in a percentage format
        """
        if name not in self.results.index:
            msg = f"No results for model {name}"
            raise ValueError(msg)
        columns = [column for column in self.results.columns if column[-1].isdigit()]
        data_frame = self.results[columns].copy()
        baseline = data_frame.loc[name]
        for idx in data_frame.index:
            if idx != name:
                diff = data_frame.loc[idx] / baseline - 1
                data_frame.loc[idx] = [str(round(v * 100, 2)) + "%" for v in diff]
            else:
                data_frame.loc[name] = ["–"] * len(baseline)
        return data_frame
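``compare`` reports every run relative to the chosen baseline row as ``value / baseline - 1``, rendered as a rounded percentage string. A quick standalone check of that arithmetic with the NDCG@3 numbers from the doctest above (plain pandas, not part of the package):

import pandas as pd

results = pd.DataFrame({"NDCG@3": [0.296082, 0.530721]}, index=["baseline", "model"])
diff = results.loc["model", "NDCG@3"] / results.loc["baseline", "NDCG@3"] - 1
print(str(round(diff * 100, 2)) + "%")  # 79.25%, as in the ``compare("baseline")`` output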

replay/experimental/metrics/hitrate.py
@@ -0,0 +1,26 @@
from .base_metric import Metric


class HitRate(Metric):
    """
    Percentage of users that have at least one
    correctly recommended item among top-k.

    .. math::
        HitRate@K(i) = \\max_{j \\in [1..K]}\\mathbb{1}_{r_{ij}}

    .. math::
        HitRate@K = \\frac {\\sum_{i=1}^{N}HitRate@K(i)}{N}

    :math:`\\mathbb{1}_{r_{ij}}` -- indicator function stating that user :math:`i` interacted with item :math:`j`

    """

    _scala_udf_name = "getHitRateMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        for i in pred[:k]:
            if i in ground_truth:
                return 1
        return 0
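The per-user value is binary: 1 as soon as any of the first ``k`` predictions appears in the ground truth, otherwise 0. A quick sanity check calling the static method directly; it assumes the wheel (and whatever optional dependencies ``base_metric`` pulls in) is installed so the import resolves:

from replay.experimental.metrics.hitrate import HitRate

print(HitRate._get_metric_value_by_user(2, [10, 20, 30], [20, 99]))  # 1 -- item 20 is within the top 2
print(HitRate._get_metric_value_by_user(1, [10, 20, 30], [20, 99]))  # 0 -- only item 10 is considered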

replay/experimental/metrics/map.py
@@ -0,0 +1,30 @@
from .base_metric import Metric


class MAP(Metric):
    """
    Mean Average Precision -- average the ``Precision`` at relevant positions for each user,
    and then calculate the mean across all users.

    .. math::
        &AP@K(i) = \\frac 1K \\sum_{j=1}^{K}\\mathbb{1}_{r_{ij}}Precision@j(i)

        &MAP@K = \\frac {\\sum_{i=1}^{N}AP@K(i)}{N}

    :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing if user :math:`i` interacted with item :math:`j`
    """

    _scala_udf_name = "getMAPMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        length = min(k, len(pred))
        if len(ground_truth) == 0 or len(pred) == 0:
            return 0
        tp_cum = 0
        result = 0
        for i in range(length):
            if pred[i] in ground_truth:
                tp_cum += 1
                result += tp_cum / (i + 1)
        return result / k
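Note that the accumulated precision-at-hit terms are divided by ``k``, not by the number of relevant items, so a user with fewer than ``k`` relevant items cannot score 1.0 even with a perfect ranking. A worked check under the same installation assumption as above:

from replay.experimental.metrics.map import MAP

# hits at positions 1 and 3: AP@3 = (1/1 + 2/3) / 3 = 5/9
print(MAP._get_metric_value_by_user(3, [1, 2, 3], [1, 3]))  # 0.5555...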

replay/experimental/metrics/mrr.py
@@ -0,0 +1,18 @@
from .base_metric import Metric


class MRR(Metric):
    """
    Mean Reciprocal Rank --
    Reciprocal Rank is the inverse position of the first relevant item among top-k recommendations,
    :math:`\\frac {1}{rank_i}`. This value is averaged by all users.
    """

    _scala_udf_name = "getMRRMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        for i in range(min(k, len(pred))):
            if pred[i] in ground_truth:
                return 1 / (1 + i)
        return 0
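For example (same installation assumption), with the first relevant item in the third slot the reciprocal rank is 1/3:

from replay.experimental.metrics.mrr import MRR

print(MRR._get_metric_value_by_user(3, [7, 5, 3], [3, 8]))  # 0.3333...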

replay/experimental/metrics/ncis_precision.py
@@ -0,0 +1,31 @@
import numpy as np

from .base_metric import NCISMetric


class NCISPrecision(NCISMetric):
    """
    Share of relevant items among top ``K`` recommendations with NCIS weighting.

    .. math::
        Precision@K(i) = \\frac {\\sum_{j=1}^{K}\\mathbb{1}_{r_{ij} w_{ij}}}{\\sum_{j=1}^{K} w_{ij}}

    .. math::
        Precision@K = \\frac {\\sum_{i=1}^{N}Precision@K(i)}{N}

    :math:`\\mathbb{1}_{r_{ij}}` -- indicator function
    showing that user :math:`i` interacted with item :math:`j`
    :math:`w_{ij}` -- NCIS weight, calculated as ratio of current policy score on previous
    policy score with clipping and optional activation over policy scores (relevance).
    Source: arxiv.org/abs/1801.07030
    """

    _scala_udf_name = "getNCISPrecisionMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, *args):
        pred, ground_truth, pred_weights = args
        if len(pred) == 0 or len(ground_truth) == 0:
            return 0
        mask = np.isin(pred[:k], ground_truth)
        return sum(np.array(pred_weights)[mask]) / sum(pred_weights[:k])
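The per-user value is the NCIS weight mass of the hits in the top ``k`` divided by the total weight of the top ``k``; ``pred`` and ``pred_weights`` are expected to be aligned element-wise. A small check with made-up weights (the concrete numbers are assumptions for illustration, same installation assumption as above):

import numpy as np

from replay.experimental.metrics.ncis_precision import NCISPrecision

pred = [1, 2, 3]
ground_truth = [2, 3]
pred_weights = np.array([0.5, 1.0, 2.0])  # hypothetical NCIS weights, one per prediction

# hits carry weight 1.0 + 2.0 = 3.0 out of a total top-3 weight of 3.5
print(NCISPrecision._get_metric_value_by_user(3, pred, ground_truth, pred_weights))  # 0.857...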

replay/experimental/metrics/ndcg.py
@@ -0,0 +1,49 @@
import math

from .base_metric import Metric


class NDCG(Metric):
    """
    Normalized Discounted Cumulative Gain is a metric
    that takes into account positions of relevant items.

    This is the binary version, it takes into account
    whether the item was consumed or not, relevance value is ignored.

    .. math::
        DCG@K(i) = \\sum_{j=1}^{K}\\frac{\\mathbb{1}_{r_{ij}}}{\\log_2 (j+1)}


    :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing that user :math:`i` interacted with item :math:`j`

    To get from :math:`DCG` to :math:`nDCG` we calculate the biggest possible value of `DCG`
    for user :math:`i` and recommendation length :math:`K`.

    .. math::
        IDCG@K(i) = max(DCG@K(i)) = \\sum_{j=1}^{K}\\frac{\\mathbb{1}_{j\\le|Rel_i|}}{\\log_2 (j+1)}

    .. math::
        nDCG@K(i) = \\frac {DCG@K(i)}{IDCG@K(i)}

    :math:`|Rel_i|` -- number of relevant items for user :math:`i`

    Metric is averaged by users.

    .. math::
        nDCG@K = \\frac {\\sum_{i=1}^{N}nDCG@K(i)}{N}
    """

    _scala_udf_name = "getNDCGMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        if len(pred) == 0 or len(ground_truth) == 0:
            return 0.0
        pred_len = min(k, len(pred))
        ground_truth_len = min(k, len(ground_truth))
        denom = [1 / math.log2(i + 2) for i in range(k)]
        dcg = sum(denom[i] for i in range(pred_len) if pred[i] in ground_truth)
        idcg = sum(denom[:ground_truth_len])

        return dcg / idcg
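A worked check of the binary nDCG above (same installation assumption):

from replay.experimental.metrics.ndcg import NDCG

# hits at positions 1 and 3: DCG@3 = 1/log2(2) + 1/log2(4) = 1.5
# three relevant items:      IDCG@3 = 1/log2(2) + 1/log2(3) + 1/log2(4) ≈ 2.131
print(NDCG._get_metric_value_by_user(3, [1, 4, 3], [1, 2, 3]))  # ≈ 0.704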

replay/experimental/metrics/precision.py
@@ -0,0 +1,22 @@
from .base_metric import Metric


class Precision(Metric):
    """
    Mean percentage of relevant items among top ``K`` recommendations.

    .. math::
        Precision@K(i) = \\frac {\\sum_{j=1}^{K}\\mathbb{1}_{r_{ij}}}{K}

    .. math::
        Precision@K = \\frac {\\sum_{i=1}^{N}Precision@K(i)}{N}

    :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing that user :math:`i` interacted with item :math:`j`"""

    _scala_udf_name = "getPrecisionMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        if len(pred) == 0:
            return 0
        return len(set(pred[:k]) & set(ground_truth)) / k
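The denominator is always ``k``, even when fewer than ``k`` items were recommended, which is visible in a direct call (same installation assumption):

from replay.experimental.metrics.precision import Precision

# two relevant items found, divided by k=5 rather than by the 3 items predicted
print(Precision._get_metric_value_by_user(5, [1, 2, 3], [2, 3, 9]))  # 0.4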

replay/experimental/metrics/recall.py
@@ -0,0 +1,25 @@
from .base_metric import Metric


class Recall(Metric):
    """
    Mean percentage of relevant items, that was shown among top ``K`` recommendations.

    .. math::
        Recall@K(i) = \\frac {\\sum_{j=1}^{K}\\mathbb{1}_{r_{ij}}}{|Rel_i|}

    .. math::
        Recall@K = \\frac {\\sum_{i=1}^{N}Recall@K(i)}{N}

    :math:`\\mathbb{1}_{r_{ij}}` -- indicator function showing that user :math:`i` interacted with item :math:`j`

    :math:`|Rel_i|` -- the number of relevant items for user :math:`i`
    """

    _scala_udf_name = "getRecallMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        if len(ground_truth) == 0:
            return 0.0
        return len(set(pred[:k]) & set(ground_truth)) / len(ground_truth)
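Unlike ``Precision``, the denominator here is the size of the user's ground truth (same installation assumption):

from replay.experimental.metrics.recall import Recall

# one of the three relevant items appears in the top 2
print(Recall._get_metric_value_by_user(2, [1, 2, 3], [2, 3, 9]))  # 0.3333...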

replay/experimental/metrics/rocauc.py
@@ -0,0 +1,49 @@
from .base_metric import Metric


class RocAuc(Metric):
    """
    Receiver Operating Characteristic/Area Under the Curve is the aggregated performance measure,
    that depends only on the order of recommended items.
    It can be interpreted as the fraction of object pairs (object of class 1, object of class 0)
    that were correctly ordered by the model.
    The bigger the value of AUC, the better the classification model.

    .. math::
        ROCAUC@K(i) = \\frac {\\sum_{s=1}^{K}\\sum_{t=1}^{K}
        \\mathbb{1}_{r_{si}<r_{ti}}
        \\mathbb{1}_{gt_{si}<gt_{ti}}}
        {\\sum_{s=1}^{K}\\sum_{t=1}^{K} \\mathbb{1}_{gt_{si}<gt_{tj}}}

    :math:`\\mathbb{1}_{r_{si}<r_{ti}}` -- indicator function showing that recommendation score for
    user :math:`i` for item :math:`s` is bigger than for item :math:`t`

    :math:`\\mathbb{1}_{gt_{si}<gt_{ti}}` -- indicator function showing that
    user :math:`i` values item :math:`s` more than item :math:`t`.

    Metric is averaged by all users.

    .. math::
        ROCAUC@K = \\frac {\\sum_{i=1}^{N}ROCAUC@K(i)}{N}
    """

    _scala_udf_name = "getRocAucMetricValue"

    @staticmethod
    def _get_metric_value_by_user(k, pred, ground_truth) -> float:
        length = min(k, len(pred))
        if len(ground_truth) == 0 or len(pred) == 0:
            return 0

        fp_cur = 0
        fp_cum = 0
        for item in pred[:length]:
            if item in ground_truth:
                fp_cum += fp_cur
            else:
                fp_cur += 1
        if fp_cur == length:
            return 0
        if fp_cum == 0:
            return 1
        return 1 - fp_cum / (fp_cur * (length - fp_cur))
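The loop counts, for each relevant item, how many non-relevant items were ranked above it (accumulated in ``fp_cum``) and normalizes by the number of (relevant, non-relevant) pairs, ``fp_cur * (length - fp_cur)``. A worked check (same installation assumption):

from replay.experimental.metrics.rocauc import RocAuc

# relevant items {2, 4}: 3 of the 2 * 2 = 4 pairs are ordered wrongly, so AUC = 1 - 3/4
print(RocAuc._get_metric_value_by_user(4, [1, 2, 3, 4], [2, 4]))  # 0.25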

replay/experimental/metrics/surprisal.py
@@ -0,0 +1,90 @@
from typing import Optional

import numpy as np

from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, SparkDataFrame
from replay.utils.spark_utils import convert2spark, get_top_k_recs

from .base_metric import RecOnlyMetric, fill_na_with_empty_array, filter_sort

if PYSPARK_AVAILABLE:
    from pyspark.sql import (
        functions as sf,
        types as st,
    )


class Surprisal(RecOnlyMetric):
    """
    Measures how many surprising rare items are present in recommendations.

    .. math::
        \\textit{Self-Information}(j)= -\\log_2 \\frac {u_j}{N}

    :math:`u_j` -- number of users that interacted with item :math:`j`.
    Cold items are treated as if they were rated by 1 user.
    That is, if they appear in recommendations it will be completely unexpected.

    Metric is normalized.

    Surprisal for item :math:`j` is

    .. math::
        Surprisal(j)= \\frac {\\textit{Self-Information}(j)}{log_2 N}

    Recommendation list surprisal is the average surprisal of items in it.

    .. math::
        Surprisal@K(i) = \\frac {\\sum_{j=1}^{K}Surprisal(j)} {K}

    Final metric is averaged by users.

    .. math::
        Surprisal@K = \\frac {\\sum_{i=1}^{N}Surprisal@K(i)}{N}
    """

    _scala_udf_name = "getSurprisalMetricValue"

    def __init__(self, log: DataFrameLike, use_scala_udf: bool = False):
        """
        Here we calculate self-information for each item

        :param log: historical data
        """
        self._use_scala_udf = use_scala_udf
        self.log = convert2spark(log)
        n_users = self.log.select("user_idx").distinct().count()
        self.item_weights = self.log.groupby("item_idx").agg(
            (sf.log2(n_users / sf.countDistinct("user_idx")) / np.log2(n_users)).alias("rec_weight")
        )

    @staticmethod
    def _get_metric_value_by_user(k, *args):
        weigths = args[0]
        return sum(weigths[:k]) / k

    def _get_enriched_recommendations(
        self,
        recommendations: SparkDataFrame,
        ground_truth: SparkDataFrame,  # noqa: ARG002
        max_k: int,
        ground_truth_users: Optional[DataFrameLike] = None,
    ) -> SparkDataFrame:
        recommendations = convert2spark(recommendations)
        ground_truth_users = convert2spark(ground_truth_users)
        recommendations = get_top_k_recs(recommendations, max_k)

        recommendations = recommendations.join(self.item_weights, on="item_idx", how="left").fillna(1.0)
        recommendations = filter_sort(recommendations, "rec_weight")
        recommendations = recommendations.select("user_idx", sf.col("rec_weight")).withColumn(
            "rec_weight",
            sf.col("rec_weight").cast(st.ArrayType(st.DoubleType(), True)),
        )
        if ground_truth_users is not None:
            recommendations = fill_na_with_empty_array(
                recommendations.join(ground_truth_users, on="user_idx", how="right"),
                "rec_weight",
                self.item_weights.schema["rec_weight"].dataType,
            )

        return recommendations
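The ``rec_weight`` computed in ``__init__`` is exactly the normalized self-information from the docstring. A standalone numpy check of that formula, no Spark required (the helper name is ours, not part of the package):

import numpy as np


def normalized_self_information(u_j: int, n_users: int) -> float:
    # -log2(u_j / N) / log2(N), as in the Surprisal docstring above
    return np.log2(n_users / u_j) / np.log2(n_users)


print(normalized_self_information(1, 100))    # 1.0 -- an item seen by a single user
print(normalized_self_information(10, 100))   # 0.5
print(normalized_self_information(100, 100))  # 0.0 -- an item everyone has interacted with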