replay-rec 0.20.0-py3-none-any.whl → 0.20.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/dataset.py +10 -9
- replay/data/dataset_utils/dataset_label_encoder.py +5 -4
- replay/data/nn/schema.py +9 -18
- replay/data/nn/sequence_tokenizer.py +26 -18
- replay/data/nn/sequential_dataset.py +22 -18
- replay/data/nn/torch_sequential_dataset.py +17 -16
- replay/data/nn/utils.py +2 -1
- replay/data/schema.py +3 -12
- replay/metrics/base_metric.py +11 -10
- replay/metrics/categorical_diversity.py +8 -8
- replay/metrics/coverage.py +4 -4
- replay/metrics/experiment.py +3 -3
- replay/metrics/hitrate.py +1 -3
- replay/metrics/map.py +1 -3
- replay/metrics/mrr.py +1 -3
- replay/metrics/ndcg.py +1 -2
- replay/metrics/novelty.py +3 -3
- replay/metrics/offline_metrics.py +16 -16
- replay/metrics/precision.py +1 -3
- replay/metrics/recall.py +1 -3
- replay/metrics/rocauc.py +1 -3
- replay/metrics/surprisal.py +4 -4
- replay/metrics/torch_metrics_builder.py +13 -12
- replay/metrics/unexpectedness.py +2 -2
- replay/models/als.py +2 -2
- replay/models/association_rules.py +4 -3
- replay/models/base_neighbour_rec.py +3 -2
- replay/models/base_rec.py +11 -10
- replay/models/cat_pop_rec.py +2 -1
- replay/models/extensions/ann/ann_mixin.py +2 -1
- replay/models/extensions/ann/index_builders/executor_hnswlib_index_builder.py +2 -1
- replay/models/extensions/ann/index_builders/executor_nmslib_index_builder.py +2 -1
- replay/models/lin_ucb.py +57 -11
- replay/models/nn/optimizer_utils/optimizer_factory.py +2 -2
- replay/models/nn/sequential/bert4rec/dataset.py +5 -18
- replay/models/nn/sequential/bert4rec/lightning.py +3 -3
- replay/models/nn/sequential/bert4rec/model.py +2 -2
- replay/models/nn/sequential/callbacks/prediction_callbacks.py +12 -12
- replay/models/nn/sequential/callbacks/validation_callback.py +9 -9
- replay/models/nn/sequential/compiled/base_compiled_model.py +5 -5
- replay/models/nn/sequential/postprocessors/_base.py +2 -3
- replay/models/nn/sequential/postprocessors/postprocessors.py +11 -11
- replay/models/nn/sequential/sasrec/dataset.py +3 -16
- replay/models/nn/sequential/sasrec/lightning.py +3 -3
- replay/models/nn/sequential/sasrec/model.py +8 -8
- replay/models/slim.py +2 -2
- replay/models/ucb.py +2 -2
- replay/models/word2vec.py +3 -3
- replay/preprocessing/discretizer.py +8 -7
- replay/preprocessing/filters.py +4 -4
- replay/preprocessing/history_based_fp.py +6 -6
- replay/preprocessing/label_encoder.py +8 -7
- replay/scenarios/fallback.py +4 -3
- replay/splitters/base_splitter.py +3 -3
- replay/splitters/cold_user_random_splitter.py +4 -4
- replay/splitters/k_folds.py +4 -4
- replay/splitters/last_n_splitter.py +10 -10
- replay/splitters/new_users_splitter.py +4 -4
- replay/splitters/random_splitter.py +4 -4
- replay/splitters/ratio_splitter.py +10 -10
- replay/splitters/time_splitter.py +6 -6
- replay/splitters/two_stage_splitter.py +4 -4
- replay/utils/__init__.py +1 -1
- replay/utils/common.py +1 -1
- replay/utils/session_handler.py +2 -2
- replay/utils/spark_utils.py +6 -5
- replay/utils/types.py +3 -1
- {replay_rec-0.20.0.dist-info → replay_rec-0.20.1.dist-info}/METADATA +7 -1
- {replay_rec-0.20.0.dist-info → replay_rec-0.20.1.dist-info}/RECORD +73 -74
- replay/utils/warnings.py +0 -26
- {replay_rec-0.20.0.dist-info → replay_rec-0.20.1.dist-info}/WHEEL +0 -0
- {replay_rec-0.20.0.dist-info → replay_rec-0.20.1.dist-info}/licenses/LICENSE +0 -0
- {replay_rec-0.20.0.dist-info → replay_rec-0.20.1.dist-info}/licenses/NOTICE +0 -0
replay/__init__.py
CHANGED
replay/data/dataset.py
CHANGED
@@ -5,8 +5,9 @@
 from __future__ import annotations

 import json
+from collections.abc import Iterable, Sequence
 from pathlib import Path
-from typing import Callable,
+from typing import Callable, Optional, Union

 import numpy as np
 from pandas import read_parquet as pd_read_parquet
@@ -315,7 +316,7 @@ class Dataset:
         :returns: Loaded Dataset.
         """
         base_path = Path(path).with_suffix(".replay").resolve()
-        with open(base_path / "init_args.json"
+        with open(base_path / "init_args.json") as file:
             dataset_dict = json.loads(file.read())

         if dataframe_type not in ["pandas", "spark", "polars", None]:
@@ -436,14 +437,14 @@ class Dataset:
         )

     def _get_feature_source_map(self):
-        self._feature_source_map:
+        self._feature_source_map: dict[FeatureSource, DataFrameLike] = {
             FeatureSource.INTERACTIONS: self.interactions,
             FeatureSource.QUERY_FEATURES: self.query_features,
             FeatureSource.ITEM_FEATURES: self.item_features,
         }

     def _get_ids_source_map(self):
-        self._ids_feature_map:
+        self._ids_feature_map: dict[FeatureHint, DataFrameLike] = {
             FeatureHint.QUERY_ID: self.query_features if self.query_features is not None else self.interactions,
             FeatureHint.ITEM_ID: self.item_features if self.item_features is not None else self.interactions,
         }
@@ -499,10 +500,10 @@ class Dataset:
         )
         return FeatureSchema(features_list=features_list + filled_features)

-    def _fill_unlabeled_features_sources(self, feature_schema: FeatureSchema) ->
+    def _fill_unlabeled_features_sources(self, feature_schema: FeatureSchema) -> list[FeatureInfo]:
         features_list = list(feature_schema.all_features)

-        source_mapping:
+        source_mapping: dict[str, FeatureSource] = {}
         for source in FeatureSource:
             dataframe = self._feature_source_map[source]
             if dataframe is not None:
@@ -524,7 +525,7 @@ class Dataset:
         self._set_cardinality(features_list=features_list)
         return features_list

-    def _get_unlabeled_columns(self, source: FeatureSource, feature_schema: FeatureSchema) ->
+    def _get_unlabeled_columns(self, source: FeatureSource, feature_schema: FeatureSchema) -> list[FeatureInfo]:
         set_source_dataframe_columns = set(self._feature_source_map[source].columns)
         set_labeled_dataframe_columns = set(feature_schema.columns)
         unlabeled_columns = set_source_dataframe_columns - set_labeled_dataframe_columns
@@ -534,13 +535,13 @@ class Dataset:
         ]
         return unlabeled_features_list

-    def _fill_unlabeled_features(self, source: FeatureSource, feature_schema: FeatureSchema) ->
+    def _fill_unlabeled_features(self, source: FeatureSource, feature_schema: FeatureSchema) -> list[FeatureInfo]:
         unlabeled_columns = self._get_unlabeled_columns(source=source, feature_schema=feature_schema)
         self._set_features_source(feature_list=unlabeled_columns, source=source)
         self._set_cardinality(features_list=unlabeled_columns)
         return unlabeled_columns

-    def _set_features_source(self, feature_list:
+    def _set_features_source(self, feature_list: list[FeatureInfo], source: FeatureSource) -> None:
         for feature in feature_list:
             feature._set_feature_source(source)

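Note: the annotation changes above repeat across most files in this release. Container annotations are completed with the builtin generics (dict[...], list[...], set[...], tuple[...]) and the abstract collection types move from typing to collections.abc. A minimal sketch of the resulting style, assuming Python 3.9+ semantics; the helper below is illustrative and not taken from the package:

from collections.abc import Sequence  # ABCs now come from collections.abc
from typing import Optional           # Optional/Union and friends stay in typing

def keep_unlabeled(columns: Sequence[str], labeled: Optional[set[str]] = None) -> list[str]:
    # Hypothetical helper, shown only for the builtin-generic annotations.
    labeled = labeled or set()
    return [column for column in columns if column not in labeled]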
replay/data/dataset_utils/dataset_label_encoder.py
CHANGED
@@ -6,7 +6,8 @@ Contains classes for encoding categorical data
 """

 import warnings
-from
+from collections.abc import Iterable, Iterator, Sequence
+from typing import Optional, Union

 from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, FeatureType
 from replay.preprocessing import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
@@ -45,9 +46,9 @@ class DatasetLabelEncoder:
         """
         self._handle_unknown_rule = handle_unknown_rule
         self._default_value_rule = default_value_rule
-        self._encoding_rules:
+        self._encoding_rules: dict[str, LabelEncodingRule] = {}

-        self._features_columns:
+        self._features_columns: dict[Union[FeatureHint, FeatureSource], Sequence[str]] = {}

     def fit(self, dataset: Dataset) -> "DatasetLabelEncoder":
         """
@@ -161,7 +162,7 @@ class DatasetLabelEncoder:
         """
         self._check_if_initialized()

-        columns_set:
+        columns_set: set[str]
         columns_set = {columns} if isinstance(columns, str) else {*columns}

         def get_encoding_rules() -> Iterator[LabelEncodingRule]:
replay/data/nn/schema.py
CHANGED
@@ -1,17 +1,8 @@
+from collections import OrderedDict
+from collections.abc import ItemsView, Iterable, Iterator, KeysView, Mapping, Sequence, ValuesView
 from typing import (
-    Dict,
-    ItemsView,
-    Iterable,
-    Iterator,
-    KeysView,
-    List,
-    Mapping,
     Optional,
-    OrderedDict,
-    Sequence,
-    Set,
     Union,
-    ValuesView,
 )

 import torch
@@ -20,7 +11,7 @@ from replay.data import FeatureHint, FeatureSource, FeatureType

 # Alias
 TensorMap = Mapping[str, torch.Tensor]
-MutableTensorMap =
+MutableTensorMap = dict[str, torch.Tensor]


 class TensorFeatureSource:
@@ -79,7 +70,7 @@ class TensorFeatureInfo:
         feature_type: FeatureType,
         is_seq: bool = False,
         feature_hint: Optional[FeatureHint] = None,
-        feature_sources: Optional[
+        feature_sources: Optional[list[TensorFeatureSource]] = None,
         cardinality: Optional[int] = None,
         padding_value: int = 0,
         embedding_dim: Optional[int] = None,
@@ -154,13 +145,13 @@ class TensorFeatureInfo:
         self._feature_hint = hint

     @property
-    def feature_sources(self) -> Optional[
+    def feature_sources(self) -> Optional[list[TensorFeatureSource]]:
         """
         :returns: List of sources feature came from.
         """
         return self._feature_sources

-    def _set_feature_sources(self, sources:
+    def _set_feature_sources(self, sources: list[TensorFeatureSource]) -> None:
         self._feature_sources = sources

     @property
@@ -276,7 +267,7 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):

         :returns: New tensor schema of given features.
         """
-        features:
+        features: set[TensorFeatureInfo] = set()
         for feature_name in features_to_keep:
             features.add(self._tensor_schema[feature_name])
         return TensorSchema(list(features))
@@ -432,7 +423,7 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
             return None
         return rating_features.item().name

-    def _get_object_args(self) ->
+    def _get_object_args(self) -> dict:
         """
         Returns list of features represented as dictionaries.
         """
@@ -456,7 +447,7 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
         return features

     @classmethod
-    def _create_object_by_args(cls, args:
+    def _create_object_by_args(cls, args: dict) -> "TensorSchema":
         features_list = []
         for feature_data in args:
             feature_data["feature_sources"] = (
replay/data/nn/sequence_tokenizer.py
CHANGED
@@ -2,8 +2,9 @@ import abc
 import json
 import pickle
 import warnings
+from collections.abc import Sequence
 from pathlib import Path
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union

 import numpy as np
 import polars as pl
@@ -14,7 +15,6 @@ from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, Feat
 from replay.data.dataset_utils import DatasetLabelEncoder
 from replay.preprocessing import LabelEncoder, LabelEncodingRule
 from replay.preprocessing.label_encoder import HandleUnknownStrategies
-from replay.utils import deprecation_warning

 if TYPE_CHECKING:
     from .schema import TensorFeatureInfo, TensorFeatureSource, TensorSchema
@@ -187,7 +187,7 @@ class SequenceTokenizer:
     def _group_dataset(
         self,
         dataset: Dataset,
-    ) ->
+    ) -> tuple[SequenceDataFrameLike, Optional[SequenceDataFrameLike], Optional[SequenceDataFrameLike]]:
         from replay.data.nn.utils import ensure_pandas, groupby_sequences

         grouped_interactions = groupby_sequences(
@@ -268,13 +268,13 @@ class SequenceTokenizer:
         tensor_schema: "TensorSchema",
         query_id_column: str,
         item_id_column: str,
-    ) ->
+    ) -> set[str]:
         # We need only features, which related to tensor schema, otherwise feature should
         # be ignored for efficiency reasons. The code below does feature filtering, and
         # keeps features used as a source in tensor schema.

         # Query and item IDs are always needed
-        features_subset:
+        features_subset: list[str] = [
             query_id_column,
             item_id_column,
         ]
@@ -303,7 +303,7 @@ class SequenceTokenizer:
             msg = "All tensor features must have sources defined"
             raise ValueError(msg)

-        source_tables:
+        source_tables: list[FeatureSource] = [s.source for s in feature_sources]

         unexpected_tables = list(filter(lambda x: not isinstance(x, FeatureSource), source_tables))
         if len(unexpected_tables) > 0:
@@ -327,7 +327,7 @@ class SequenceTokenizer:
         tensor_features_to_keep: Optional[Sequence[str]] = None,
     ) -> None:
         # Check if all source columns specified in tensor schema exist in provided data frames
-        sources_for_tensors:
+        sources_for_tensors: list["TensorFeatureSource"] = []
         for tensor_feature_name, tensor_feature in tensor_schema.items():
             if tensor_features_to_keep is not None and tensor_feature_name not in tensor_features_to_keep:
                 continue
@@ -405,7 +405,6 @@ class SequenceTokenizer:
                 tensor_feature._set_cardinality(dataset_feature.cardinality)

     @classmethod
-    @deprecation_warning("with `use_pickle` equals to `True` will be deprecated in future versions")
     def load(cls, path: str, use_pickle: bool = False, **kwargs) -> "SequenceTokenizer":
         """
         Load tokenizer object from the given path.
@@ -421,7 +420,7 @@

         if not use_pickle:
             base_path = Path(path).with_suffix(".replay").resolve()
-            with open(base_path / "init_args.json"
+            with open(base_path / "init_args.json") as file:
                 tokenizer_dict = json.loads(file.read())

             # load tensor_schema, tensor_features
@@ -449,12 +448,16 @@
             tokenizer._encoder._features_columns = encoder_features_columns
             tokenizer._encoder._encoding_rules = tokenizer_dict["encoder"]["encoding_rules"]
         else:
+            warnings.warn(
+                "with `use_pickle` equals to `True` will be deprecated in future versions",
+                DeprecationWarning,
+                stacklevel=2,
+            )
             with open(path, "rb") as file:
                 tokenizer = pickle.load(file)

         return tokenizer

-    @deprecation_warning("with `use_pickle` equals to `True` will be deprecated in future versions")
     def save(self, path: str, use_pickle: bool = False) -> None:
         """
         Save the tokenizer to the given path.
@@ -495,6 +498,11 @@
             with open(base_path / "init_args.json", "w+") as file:
                 json.dump(tokenizer_dict, file)
         else:
+            warnings.warn(
+                "with `use_pickle` equals to `True` will be deprecated in future versions",
+                DeprecationWarning,
+                stacklevel=2,
+            )
             with open(path, "wb") as file:
                 pickle.dump(self, file)

@@ -625,7 +633,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):
         """
         :returns: processed Pandas DataFrame with all features from tensor schema.
         """
-        all_features:
+        all_features: dict[str, Union[np.ndarray, list[np.ndarray]]] = {}
         all_features[self._query_id_column] = self._grouped_interactions[self._query_id_column].values

         for tensor_feature_name in self._tensor_schema:
@@ -635,7 +643,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):

     def _process_num_interaction_feature(
         self, tensor_feature: "TensorFeatureInfo"
-    ) -> Union[
+    ) -> Union[list[np.ndarray], list[list]]:
         """
         Process numerical interaction feature.

@@ -656,7 +664,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):
             values.append(np.array(sequence))
         return values

-    def _process_num_item_feature(self, tensor_feature: "TensorFeatureInfo") -> Union[
+    def _process_num_item_feature(self, tensor_feature: "TensorFeatureInfo") -> Union[list[np.ndarray], list[list]]:
         """
         Process numerical feature from item features dataset.

@@ -682,7 +690,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):

         return values

-    def _process_num_query_feature(self, tensor_feature: "TensorFeatureInfo") ->
+    def _process_num_query_feature(self, tensor_feature: "TensorFeatureInfo") -> list[np.ndarray]:
         """
         Process numerical feature from query features dataset.

@@ -694,7 +702,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):

     def _process_cat_interaction_feature(
         self, tensor_feature: "TensorFeatureInfo"
-    ) -> Union[
+    ) -> Union[list[np.ndarray], list[list]]:
         """
         Process categorical interaction feature.

@@ -715,7 +723,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):
             values.append(np.array(sequence))
         return values

-    def _process_cat_query_feature(self, tensor_feature: "TensorFeatureInfo") ->
+    def _process_cat_query_feature(self, tensor_feature: "TensorFeatureInfo") -> list[np.ndarray]:
         """
         Process categorical feature from query features dataset.

@@ -744,7 +752,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):
         ]
         return [np.array([query_feature[i]]).reshape(-1) for i in range(len(self._grouped_interactions))]

-    def _process_cat_item_feature(self, tensor_feature: "TensorFeatureInfo") -> Union[
+    def _process_cat_item_feature(self, tensor_feature: "TensorFeatureInfo") -> Union[list[np.ndarray], list[list]]:
         """
         Process categorical feature from item features dataset.

@@ -760,7 +768,7 @@ class _PandasSequenceProcessor(_BaseSequenceProcessor[PandasDataFrame]):
         assert source is not None

         item_feature = self._item_features[source.column]
-        values:
+        values: list[np.ndarray] = []

         for item_id_sequence in self._grouped_interactions[self._item_id_column]:
             feature_sequence = item_feature.loc[item_id_sequence].values
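In the hunks above, SequenceTokenizer.save and SequenceTokenizer.load drop the @deprecation_warning decorator; the DeprecationWarning is now emitted inside the method bodies, and only when the pickle branch is actually taken, so callers using the default use_pickle=False are no longer warned. A rough stand-in that mirrors the new control flow (save_artifact is hypothetical, not part of the package):

import pickle
import warnings

def save_artifact(obj, path: str, use_pickle: bool = False) -> None:
    # Mirrors the new behaviour: warn and pickle only on the deprecated branch.
    if use_pickle:
        warnings.warn(
            "with `use_pickle` equals to `True` will be deprecated in future versions",
            DeprecationWarning,
            stacklevel=2,
        )
        with open(path, "wb") as file:
            pickle.dump(obj, file)
    else:
        ...  # JSON-based `.replay` serialization, as in SequenceTokenizer.save above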
replay/data/nn/sequential_dataset.py
CHANGED
@@ -1,7 +1,7 @@
 import abc
 import json
 from pathlib import Path
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Union

 import numpy as np
 import pandas as pd
@@ -90,7 +90,7 @@ class SequentialDataset(abc.ABC):
     @staticmethod
     def keep_common_query_ids(
         lhs: "SequentialDataset", rhs: "SequentialDataset"
-    ) ->
+    ) -> tuple["SequentialDataset", "SequentialDataset"]:
         """
         Returns `SequentialDatasets` that contain query ids from both datasets.

@@ -110,17 +110,27 @@

         sequential_dict = {}
         sequential_dict["_class_name"] = self.__class__.__name__
-
+
+        df = SequentialDataset._convert_array_to_list(self._sequences)
+        df.reset_index().to_parquet(base_path / "sequences.parquet")
         sequential_dict["init_args"] = {
             "tensor_schema": self._tensor_schema._get_object_args(),
             "query_id_column": self._query_id_column,
             "item_id_column": self._item_id_column,
-            "sequences_path": "sequences.
+            "sequences_path": "sequences.parquet",
         }

         with open(base_path / "init_args.json", "w+") as file:
             json.dump(sequential_dict, file)

+    @staticmethod
+    def _convert_array_to_list(df):
+        return df.map(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)
+
+    @staticmethod
+    def _convert_list_to_array(df):
+        return df.map(lambda x: np.array(x) if isinstance(x, list) else x)
+

 class PandasSequentialDataset(SequentialDataset):
     """
@@ -149,7 +159,7 @@ class PandasSequentialDataset(SequentialDataset):
         if sequences.index.name != query_id_column:
             sequences = sequences.set_index(query_id_column)

-        self._sequences = sequences
+        self._sequences = SequentialDataset._convert_list_to_array(sequences)

     def __len__(self) -> int:
         return len(self._sequences)
@@ -203,10 +213,11 @@ class PandasSequentialDataset(SequentialDataset):
         from replay.data.nn import TensorSchema

         base_path = Path(path).with_suffix(".replay").resolve()
-        with open(base_path / "init_args.json"
+        with open(base_path / "init_args.json") as file:
            sequential_dict = json.loads(file.read())

-        sequences = pd.
+        sequences = pd.read_parquet(base_path / sequential_dict["init_args"]["sequences_path"])
+        sequences = cls._convert_array_to_list(sequences)
         dataset = cls(
             tensor_schema=TensorSchema._create_object_by_args(sequential_dict["init_args"]["tensor_schema"]),
             query_id_column=sequential_dict["init_args"]["query_id_column"],
@@ -258,18 +269,11 @@ class PolarsSequentialDataset(PandasSequentialDataset):

     def _convert_polars_to_pandas(self, df: PolarsDataFrame) -> PandasDataFrame:
         pandas_df = PandasDataFrame(df.to_dict(as_series=False))
-
-        for column in pandas_df.select_dtypes(include="object").columns:
-            if isinstance(pandas_df[column].iloc[0], list):
-                pandas_df[column] = pandas_df[column].apply(lambda x: np.array(x))
-
+        pandas_df = SequentialDataset._convert_list_to_array(pandas_df)
         return pandas_df

     def _convert_pandas_to_polars(self, df: PandasDataFrame) -> PolarsDataFrame:
-
-            if isinstance(df[column].iloc[0], np.ndarray):
-                df[column] = df[column].apply(lambda x: x.tolist())
-
+        df = SequentialDataset._convert_array_to_list(df)
         return pl.from_dict(df.to_dict("list"))

     @classmethod
@@ -287,10 +291,10 @@ class PolarsSequentialDataset(PandasSequentialDataset):
         from replay.data.nn import TensorSchema

         base_path = Path(path).with_suffix(".replay").resolve()
-        with open(base_path / "init_args.json"
+        with open(base_path / "init_args.json") as file:
             sequential_dict = json.loads(file.read())

-        sequences = pl.
+        sequences = pl.from_pandas(pd.read_parquet(base_path / sequential_dict["init_args"]["sequences_path"]))
         dataset = cls(
             tensor_schema=TensorSchema._create_object_by_args(sequential_dict["init_args"]["tensor_schema"]),
             query_id_column=sequential_dict["init_args"]["query_id_column"],
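The sequences payload of PandasSequentialDataset/PolarsSequentialDataset is now written with to_parquet, and the new _convert_array_to_list/_convert_list_to_array helpers normalise cell types around the round trip (ndarray cells become lists before writing, list cells become ndarrays afterwards). A small self-contained sketch of that round trip; column names are made up, and DataFrame.map needs pandas 2.1+ (use applymap on older versions):

import numpy as np
import pandas as pd

frame = pd.DataFrame(
    {"user_id": [0, 1], "item_id": [np.array([1, 2, 3]), np.array([4])]}
).set_index("user_id")

# ndarray cells -> lists so the Parquet writer can store a nested (list) column.
writable = frame.map(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)
writable.reset_index().to_parquet("sequences.parquet")  # writes a local file

# After reading, any list cells are turned back into ndarrays before use.
restored = pd.read_parquet("sequences.parquet").set_index("user_id")
restored = restored.map(lambda x: np.array(x) if isinstance(x, list) else x)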
replay/data/nn/torch_sequential_dataset.py
CHANGED
@@ -1,11 +1,11 @@
-
+import warnings
+from collections.abc import Generator, Sequence
+from typing import TYPE_CHECKING, NamedTuple, Optional, Union, cast

 import numpy as np
 import torch
 from torch.utils.data import Dataset as TorchDataset

-from replay.utils import deprecation_warning
-
 if TYPE_CHECKING:
     from .schema import TensorFeatureInfo, TensorMap, TensorSchema
     from .sequential_dataset import SequentialDataset
@@ -28,16 +28,12 @@ class TorchSequentialDataset(TorchDataset):
     Torch dataset for sequential recommender models
     """

-    @deprecation_warning(
-        "`padding_value` parameter will be removed in future versions. "
-        "Instead, you should specify `padding_value` for each column in TensorSchema"
-    )
     def __init__(
         self,
         sequential: "SequentialDataset",
         max_sequence_length: int,
         sliding_window_step: Optional[int] = None,
-        padding_value: int =
+        padding_value: Optional[int] = None,
     ) -> None:
         """
         :param sequential: sequential dataset
@@ -52,6 +48,15 @@
         self._sequential = sequential
         self._max_sequence_length = max_sequence_length
         self._sliding_window_step = sliding_window_step
+        if padding_value is not None:
+            warnings.warn(
+                "`padding_value` parameter will be removed in future versions. "
+                "Instead, you should specify `padding_value` for each column in TensorSchema",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        else:
+            padding_value = 0
         self._padding_value = padding_value
         self._index2sequence_map = self._build_index2sequence_map()

@@ -110,7 +115,7 @@
             return sequence

         # form shape for padded_sequence. Now supported one and two-dimentions features
-        padded_sequence_shape: Union[
+        padded_sequence_shape: Union[tuple[int, int], tuple[int]]
         if len(sequence.shape) == 1:
             padded_sequence_shape = (self._max_sequence_length,)
         elif len(sequence.shape) == 2:
@@ -134,10 +139,10 @@
             return torch.float32
         assert False, "Unknown tensor feature type"

-    def _build_index2sequence_map(self) -> Sequence[
+    def _build_index2sequence_map(self) -> Sequence[tuple[int, int]]:
         return list(self._iter_with_window())

-    def _iter_with_window(self) -> Generator[
+    def _iter_with_window(self) -> Generator[tuple[int, int], None, None]:
         for i in range(len(self._sequential)):
             actual_seq_len = self._sequential.get_sequence_length(i)
             left_seq_len = actual_seq_len - self._max_sequence_length
@@ -176,17 +181,13 @@ class TorchSequentialValidationDataset(TorchDataset):
     Torch dataset for sequential recommender models that additionally stores ground truth
     """

-    @deprecation_warning(
-        "`padding_value` parameter will be removed in future versions. "
-        "Instead, you should specify `padding_value` for each column in TensorSchema"
-    )
     def __init__(
         self,
         sequential: "SequentialDataset",
         ground_truth: "SequentialDataset",
         train: "SequentialDataset",
         max_sequence_length: int,
-        padding_value: int =
+        padding_value: Optional[int] = None,
         sliding_window_step: Optional[int] = None,
         label_feature_name: Optional[str] = None,
     ):
replay/data/nn/utils.py
CHANGED
replay/data/schema.py
CHANGED
@@ -1,18 +1,9 @@
+from collections.abc import ItemsView, Iterable, Iterator, KeysView, Mapping, Sequence, ValuesView
 from enum import Enum
 from typing import (
     Callable,
-    Dict,
-    ItemsView,
-    Iterable,
-    Iterator,
-    KeysView,
-    List,
-    Mapping,
     Optional,
-    Sequence,
-    Set,
     Union,
-    ValuesView,
 )


@@ -162,7 +153,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
         in original schema to keep in subset.
         :returns: new feature schema of given features.
         """
-        features:
+        features: set[FeatureInfo] = set()
         for feature_column in features_to_keep:
             if feature_column in self._features_schema:
                 features.add(self._features_schema[feature_column])
@@ -438,7 +429,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
         """
         unique_columns = set()
         duplicates = set()
-        item_query_names:
+        item_query_names: dict[FeatureHint, list[str]] = {
             FeatureHint.ITEM_ID: [],
             FeatureHint.QUERY_ID: [],
         }