nextrec-0.3.6-py3-none-any.whl → nextrec-0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +244 -113
  7. nextrec/basic/loggers.py +62 -43
  8. nextrec/basic/metrics.py +268 -119
  9. nextrec/basic/model.py +1373 -443
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +498 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +42 -24
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +303 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +106 -40
  23. nextrec/models/match/dssm.py +82 -69
  24. nextrec/models/match/dssm_v2.py +72 -58
  25. nextrec/models/match/mind.py +175 -108
  26. nextrec/models/match/sdm.py +104 -88
  27. nextrec/models/match/youtube_dnn.py +73 -60
  28. nextrec/models/multi_task/esmm.py +53 -39
  29. nextrec/models/multi_task/mmoe.py +70 -47
  30. nextrec/models/multi_task/ple.py +107 -50
  31. nextrec/models/multi_task/poso.py +121 -41
  32. nextrec/models/multi_task/share_bottom.py +54 -38
  33. nextrec/models/ranking/afm.py +172 -45
  34. nextrec/models/ranking/autoint.py +84 -61
  35. nextrec/models/ranking/dcn.py +59 -42
  36. nextrec/models/ranking/dcn_v2.py +64 -23
  37. nextrec/models/ranking/deepfm.py +36 -26
  38. nextrec/models/ranking/dien.py +158 -102
  39. nextrec/models/ranking/din.py +88 -60
  40. nextrec/models/ranking/fibinet.py +55 -35
  41. nextrec/models/ranking/fm.py +32 -26
  42. nextrec/models/ranking/masknet.py +95 -34
  43. nextrec/models/ranking/pnn.py +34 -31
  44. nextrec/models/ranking/widedeep.py +37 -29
  45. nextrec/models/ranking/xdeepfm.py +63 -41
  46. nextrec/utils/__init__.py +61 -32
  47. nextrec/utils/config.py +490 -0
  48. nextrec/utils/device.py +52 -12
  49. nextrec/utils/distributed.py +141 -0
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +32 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/optimizer.py +25 -9
  55. nextrec/utils/synthetic_data.py +531 -0
  56. nextrec/utils/tensor.py +24 -13
  57. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
  58. nextrec-0.4.2.dist-info/RECORD +69 -0
  59. nextrec-0.4.2.dist-info/entry_points.txt +2 -0
  60. nextrec-0.3.6.dist-info/RECORD +0 -64
  61. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
  62. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/data/preprocessor.py
@@ -5,6 +5,7 @@ Date: create on 13/11/2025
 Checkpoint: edit on 02/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
+
 from __future__ import annotations
 import os
 import pickle
@@ -16,13 +17,25 @@ import pandas as pd
 import tqdm
 from pathlib import Path
 from typing import Dict, Union, Optional, Literal, Any
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, LabelEncoder
+from sklearn.preprocessing import (
+    StandardScaler,
+    MinMaxScaler,
+    RobustScaler,
+    MaxAbsScaler,
+    LabelEncoder,
+)
 
 
 from nextrec.basic.features import FeatureSet
 from nextrec.basic.loggers import colorize
 from nextrec.basic.session import resolve_save_path
-from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+from nextrec.utils.file import (
+    resolve_file_paths,
+    iter_file_chunks,
+    read_table,
+    load_dataframes,
+    default_output_dir,
+)
 
 from nextrec.__version__ import __version__
 
@@ -36,164 +49,179 @@ class DataProcessor(FeatureSet):
         self.version = __version__
 
         self.is_fitted = False
-        self._transform_summary_printed = False # Track if summary has been printed during transform
-
+        self._transform_summary_printed = (
+            False  # Track if summary has been printed during transform
+        )
+
         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
         self.set_target_id([], [])
 
     def add_numeric_feature(
-        self,
-        name: str,
-        scaler: Optional[Literal['standard', 'minmax', 'robust', 'maxabs', 'log', 'none']] = 'standard',
-        fill_na: Optional[float] = None
+        self,
+        name: str,
+        scaler: Optional[
+            Literal["standard", "minmax", "robust", "maxabs", "log", "none"]
+        ] = "standard",
+        fill_na: Optional[float] = None,
     ):
-        self.numeric_features[name] = {
-            'scaler': scaler,
-            'fill_na': fill_na
-        }
-
+        self.numeric_features[name] = {"scaler": scaler, "fill_na": fill_na}
+
     def add_sparse_feature(
-        self,
-        name: str,
-        encode_method: Literal['hash', 'label'] = 'label',
+        self,
+        name: str,
+        encode_method: Literal["hash", "label"] = "label",
         hash_size: Optional[int] = None,
-        fill_na: str = '<UNK>'
+        fill_na: str = "<UNK>",
     ):
-        if encode_method == 'hash' and hash_size is None:
+        if encode_method == "hash" and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
         self.sparse_features[name] = {
-            'encode_method': encode_method,
-            'hash_size': hash_size,
-            'fill_na': fill_na
+            "encode_method": encode_method,
+            "hash_size": hash_size,
+            "fill_na": fill_na,
         }
-
+
     def add_sequence_feature(
-        self,
+        self,
         name: str,
-        encode_method: Literal['hash', 'label'] = 'label',
+        encode_method: Literal["hash", "label"] = "label",
         hash_size: Optional[int] = None,
         max_len: Optional[int] = 50,
         pad_value: int = 0,
-        truncate: Literal['pre', 'post'] = 'pre', # pre: keep last max_len items, post: keep first max_len items
-        separator: str = ','
+        truncate: Literal[
+            "pre", "post"
+        ] = "pre",  # pre: keep last max_len items, post: keep first max_len items
+        separator: str = ",",
     ):
-        if encode_method == 'hash' and hash_size is None:
+        if encode_method == "hash" and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
         self.sequence_features[name] = {
-            'encode_method': encode_method,
-            'hash_size': hash_size,
-            'max_len': max_len,
-            'pad_value': pad_value,
-            'truncate': truncate,
-            'separator': separator
+            "encode_method": encode_method,
+            "hash_size": hash_size,
+            "max_len": max_len,
+            "pad_value": pad_value,
+            "truncate": truncate,
+            "separator": separator,
         }
-
+
     def add_target(
-        self,
-        name: str, # example: 'click'
-        target_type: Literal['binary', 'multiclass', 'regression'] = 'binary',
-        label_map: Optional[Dict[str, int]] = None # example: {'click': 1, 'no_click': 0}
+        self,
+        name: str,  # example: 'click'
+        target_type: Literal["binary", "multiclass", "regression"] = "binary",
+        label_map: Optional[
+            Dict[str, int]
+        ] = None,  # example: {'click': 1, 'no_click': 0}
     ):
         self.target_features[name] = {
-            'target_type': target_type,
-            'label_map': label_map
+            "target_type": target_type,
+            "label_map": label_map,
         }
         self.set_target_id(list(self.target_features.keys()), [])
-
+
     def hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size
-
+
     def process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
-        scaler_type = config['scaler']
-        fill_na = config['fill_na']
+        scaler_type = config["scaler"]
+        fill_na = config["fill_na"]
         if data.isna().any():
             if fill_na is None:
                 # Default use mean value to fill missing values for numeric features
                 fill_na = data.mean()
-            config['fill_na_value'] = fill_na
-        if scaler_type == 'standard':
+            config["fill_na_value"] = fill_na
+        if scaler_type == "standard":
             scaler = StandardScaler()
-        elif scaler_type == 'minmax':
+        elif scaler_type == "minmax":
             scaler = MinMaxScaler()
-        elif scaler_type == 'robust':
+        elif scaler_type == "robust":
             scaler = RobustScaler()
-        elif scaler_type == 'maxabs':
+        elif scaler_type == "maxabs":
             scaler = MaxAbsScaler()
-        elif scaler_type == 'log':
-            scaler = None
-        elif scaler_type == 'none':
+        elif scaler_type == "log":
+            scaler = None
+        elif scaler_type == "none":
             scaler = None
         else:
             raise ValueError(f"Unknown scaler type: {scaler_type}")
-        if scaler is not None and scaler_type != 'log':
-            filled_data = data.fillna(config.get('fill_na_value', 0))
+        if scaler is not None and scaler_type != "log":
+            filled_data = data.fillna(config.get("fill_na_value", 0))
             values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
             scaler.fit(values)
             self.scalers[name] = scaler
-
-    def process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+
+    def process_numeric_feature_transform(
+        self, data: pd.Series, config: Dict[str, Any]
+    ) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
-        scaler_type = config['scaler']
-        fill_na_value = config.get('fill_na_value', 0)
+        scaler_type = config["scaler"]
+        fill_na_value = config.get("fill_na_value", 0)
         filled_data = data.fillna(fill_na_value)
         values = np.array(filled_data.values, dtype=np.float64)
-        if scaler_type == 'log':
+        if scaler_type == "log":
             result = np.log1p(np.maximum(values, 0))
-        elif scaler_type == 'none':
+        elif scaler_type == "none":
             result = values
         else:
             scaler = self.scalers.get(name)
             if scaler is None:
-                logger.warning(f"Scaler for {name} not fitted, returning original values")
+                logger.warning(
+                    f"Scaler for {name} not fitted, returning original values"
+                )
                 result = values
             else:
                 result = scaler.transform(values.reshape(-1, 1)).ravel()
         return result
-
+
     def process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
-        encode_method = config['encode_method']
-        fill_na = config['fill_na'] # <UNK>
+        encode_method = config["encode_method"]
+        fill_na = config["fill_na"]  # <UNK>
         filled_data = data.fillna(fill_na).astype(str)
-        if encode_method == 'label':
+        if encode_method == "label":
             le = LabelEncoder()
             le.fit(filled_data)
             self.label_encoders[name] = le
-            config['vocab_size'] = len(le.classes_)
-        elif encode_method == 'hash':
-            config['vocab_size'] = config['hash_size']
-
-    def process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+            config["vocab_size"] = len(le.classes_)
+        elif encode_method == "hash":
+            config["vocab_size"] = config["hash_size"]
+
+    def process_sparse_feature_transform(
+        self, data: pd.Series, config: Dict[str, Any]
+    ) -> np.ndarray:
         name = str(data.name)
-        encode_method = config['encode_method']
-        fill_na = config['fill_na']
+        encode_method = config["encode_method"]
+        fill_na = config["fill_na"]
         sparse_series = pd.Series(data, name=name).fillna(fill_na).astype(str)
-        if encode_method == 'label':
+        if encode_method == "label":
             le = self.label_encoders.get(name)
             if le is None:
                 raise ValueError(f"LabelEncoder for {name} not fitted")
-            class_to_idx = config.get('_class_to_idx')
+            class_to_idx = config.get("_class_to_idx")
             if class_to_idx is None:
                 class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
-                config['_class_to_idx'] = class_to_idx
+                config["_class_to_idx"] = class_to_idx
             encoded = sparse_series.map(class_to_idx)
             encoded = encoded.fillna(0).astype(np.int64)
             return encoded.to_numpy()
-        if encode_method == 'hash':
-            hash_size = config['hash_size']
+        if encode_method == "hash":
+            hash_size = config["hash_size"]
             hash_fn = self.hash_string
-            return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
+            return np.fromiter(
+                (hash_fn(v, hash_size) for v in sparse_series.to_numpy()),
+                dtype=np.int64,
+                count=sparse_series.size,
+            )
         return np.array([], dtype=np.int64)
-
+
     def process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
-        encode_method = config['encode_method']
-        separator = config['separator']
-        if encode_method == 'label':
+        encode_method = config["encode_method"]
+        separator = config["separator"]
+        if encode_method == "label":
             all_tokens = set()
             for seq in data:
                 # Skip None, np.nan, and empty strings
@@ -201,9 +229,9 @@ class DataProcessor(FeatureSet):
                     continue
                 if isinstance(seq, (float, np.floating)) and np.isnan(seq):
                     continue
-                if isinstance(seq, str) and seq.strip() == '':
+                if isinstance(seq, str) and seq.strip() == "":
                     continue
-
+
                 if isinstance(seq, str):
                     tokens = seq.split(separator)
                 elif isinstance(seq, (list, tuple)):
@@ -214,40 +242,42 @@ class DataProcessor(FeatureSet):
                     continue
                 all_tokens.update(tokens)
             if len(all_tokens) == 0:
-                all_tokens.add('<PAD>')
+                all_tokens.add("<PAD>")
             le = LabelEncoder()
             le.fit(list(all_tokens))
             self.label_encoders[name] = le
-            config['vocab_size'] = len(le.classes_)
-        elif encode_method == 'hash':
-            config['vocab_size'] = config['hash_size']
-
-    def process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+            config["vocab_size"] = len(le.classes_)
+        elif encode_method == "hash":
+            config["vocab_size"] = config["hash_size"]
+
+    def process_sequence_feature_transform(
+        self, data: pd.Series, config: Dict[str, Any]
+    ) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
-        encode_method = config['encode_method']
-        max_len = config['max_len']
-        pad_value = config['pad_value']
-        truncate = config['truncate']
-        separator = config['separator']
+        encode_method = config["encode_method"]
+        max_len = config["max_len"]
+        pad_value = config["pad_value"]
+        truncate = config["truncate"]
+        separator = config["separator"]
         arr = np.asarray(data, dtype=object)
         n = arr.shape[0]
         output = np.full((n, max_len), pad_value, dtype=np.int64)
         # Shared helpers cached locally for speed and cross-platform consistency
         split_fn = str.split
         is_nan = np.isnan
-        if encode_method == 'label':
+        if encode_method == "label":
             le = self.label_encoders.get(name)
             if le is None:
                 raise ValueError(f"LabelEncoder for {name} not fitted")
-            class_to_idx = config.get('_class_to_idx')
+            class_to_idx = config.get("_class_to_idx")
             if class_to_idx is None:
                 class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
-                config['_class_to_idx'] = class_to_idx
+                config["_class_to_idx"] = class_to_idx
         else:
             class_to_idx = None  # type: ignore
         hash_fn = self.hash_string
-        hash_size = config.get('hash_size')
+        hash_size = config.get("hash_size")
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings
             tokens = []
@@ -262,30 +292,34 @@ class DataProcessor(FeatureSet):
                 tokens = [str(t) for t in seq]
             else:
                 tokens = []
-            if encode_method == 'label':
+            if encode_method == "label":
                 encoded = [
                     class_to_idx.get(token.strip(), 0)  # type: ignore[union-attr]
                     for token in tokens
-                    if token is not None and token != ''
+                    if token is not None and token != ""
                 ]
-            elif encode_method == 'hash':
+            elif encode_method == "hash":
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
-                encoded = [hash_fn(str(token), hash_size) for token in tokens if str(token).strip()]
+                encoded = [
+                    hash_fn(str(token), hash_size)
+                    for token in tokens
+                    if str(token).strip()
+                ]
             else:
                 encoded = []
             if not encoded:
                 continue
             if len(encoded) > max_len:
-                encoded = encoded[-max_len:] if truncate == 'pre' else encoded[:max_len]
+                encoded = encoded[-max_len:] if truncate == "pre" else encoded[:max_len]
             output[i, : len(encoded)] = encoded
         return output
-
+
     def process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
-        target_type = config['target_type']
-        label_map = config.get('label_map')
-        if target_type in ['binary', 'multiclass']:
+        target_type = config["target_type"]
+        label_map = config.get("label_map")
+        if target_type in ["binary", "multiclass"]:
             if label_map is None:
                 unique_values = data.dropna().unique()
                 sorted_values = sorted(unique_values)
@@ -294,23 +328,27 @@
                     if int_values == list(range(len(int_values))):
                         label_map = {str(val): int(val) for val in sorted_values}
                     else:
-                        label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
+                        label_map = {
+                            str(val): idx for idx, val in enumerate(sorted_values)
+                        }
                 except (ValueError, TypeError):
                     label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
-            config['label_map'] = label_map
+            config["label_map"] = label_map
         self.target_encoders[name] = label_map
-
-    def process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+
+    def process_target_transform(
+        self, data: pd.Series, config: Dict[str, Any]
+    ) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
-        target_type = config.get('target_type')
-        if target_type == 'regression':
+        target_type = config.get("target_type")
+        if target_type == "regression":
             values = np.array(data.values, dtype=np.float32)
             return values
         else:
             label_map = self.target_encoders.get(name)
             if label_map is None:
-                raise ValueError(f"Target encoder for {name} not fitted")
+                raise ValueError(f"Target encoder for {name} not fitted")
             result = []
             for val in data:
                 str_val = str(val)
@@ -319,8 +357,10 @@
                 else:
                     logger.warning(f"Unknown target value: {val}, mapping to 0")
                     result.append(0)
-            return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)
-
+            return np.array(
+                result, dtype=np.int64 if target_type == "multiclass" else np.float32
+            )
+
     def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
         """Load all data from a file or directory path into a single DataFrame."""
         file_paths, file_type = resolve_file_paths(path)
@@ -340,10 +380,16 @@
             return [str(v) for v in value]
         return [str(value)]
 
-    def fit_from_path(self, path: str, chunk_size: int) -> 'DataProcessor':
+    def fit_from_path(self, path: str, chunk_size: int) -> "DataProcessor":
         """Fit processor statistics by streaming files to reduce memory usage."""
         logger = logging.getLogger()
-        logger.info(colorize("Fitting DataProcessor (streaming path mode)...", color="cyan", bold=True))
+        logger.info(
+            colorize(
+                "Fitting DataProcessor (streaming path mode)...",
+                color="cyan",
+                bold=True,
+            )
+        )
         file_paths, file_type = resolve_file_paths(path)
 
         numeric_acc: Dict[str, Dict[str, float]] = {}
@@ -356,9 +402,15 @@
                 "max": -np.inf,
                 "max_abs": 0.0,
             }
-        sparse_vocab: Dict[str, set[str]] = {name: set() for name in self.sparse_features.keys()}
-        seq_vocab: Dict[str, set[str]] = {name: set() for name in self.sequence_features.keys()}
-        target_values: Dict[str, set[Any]] = {name: set() for name in self.target_features.keys()}
+        sparse_vocab: Dict[str, set[str]] = {
+            name: set() for name in self.sparse_features.keys()
+        }
+        seq_vocab: Dict[str, set[str]] = {
+            name: set() for name in self.sequence_features.keys()
+        }
+        target_values: Dict[str, set[Any]] = {
+            name: set() for name in self.target_features.keys()
+        }
         missing_features = set()
         for file_path in file_paths:
             for chunk in iter_file_chunks(file_path, file_type, chunk_size):
@@ -410,12 +462,16 @@
                     vals = chunk[name].dropna().tolist()
                     target_values[name].update(vals)
         if missing_features:
-            logger.warning(f"The following configured features were not found in provided files: {sorted(missing_features)}")
+            logger.warning(
+                f"The following configured features were not found in provided files: {sorted(missing_features)}"
+            )
         # finalize numeric scalers
         for name, config in self.numeric_features.items():
             acc = numeric_acc[name]
             if acc["count"] == 0:
-                logger.warning(f"Numeric feature {name} has no valid values in provided files")
+                logger.warning(
+                    f"Numeric feature {name} has no valid values in provided files"
+                )
                 continue
             mean_val = acc["sum"] / acc["count"]
             if config["fill_na"] is not None:
@@ -428,7 +484,9 @@
                 scaler = StandardScaler()
                 scaler.mean_ = np.array([mean_val], dtype=np.float64)
                 scaler.var_ = np.array([var], dtype=np.float64)
-                scaler.scale_ = np.array([np.sqrt(var) if var > 0 else 1.0], dtype=np.float64)
+                scaler.scale_ = np.array(
+                    [np.sqrt(var) if var > 0 else 1.0], dtype=np.float64
+                )
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
             elif scaler_type == "minmax":
@@ -503,15 +561,25 @@
                     if int_values == list(range(len(int_values))):
                         label_map = {str(val): int(val) for val in sorted_values}
                     else:
-                        label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
+                        label_map = {
+                            str(val): idx for idx, val in enumerate(sorted_values)
+                        }
                 except (ValueError, TypeError):
-                    label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
+                    label_map = {
+                        str(val): idx for idx, val in enumerate(sorted_values)
+                    }
             config["label_map"] = label_map
 
             self.target_encoders[name] = label_map
 
         self.is_fitted = True
-        logger.info(colorize("DataProcessor fitted successfully (streaming path mode)", color="green", bold=True))
+        logger.info(
+            colorize(
+                "DataProcessor fitted successfully (streaming path mode)",
+                color="green",
+                bold=True,
+            )
+        )
         return self
 
     def transform_in_memory(
@@ -522,7 +590,7 @@
         save_format: Optional[Literal["csv", "parquet"]],
         output_path: Optional[str],
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
-        logger = logging.getLogger()
+        logger = logging.getLogger()
         # Convert input to dict format for unified processing
         if isinstance(data, pd.DataFrame):
             data_dict = {col: data[col] for col in data.columns}
@@ -530,7 +598,7 @@
             data_dict = data
         else:
             raise ValueError(f"Unsupported data type: {type(data)}")
-
+
         result_dict = {}
         for key, value in data_dict.items():
             if isinstance(value, pd.Series):
@@ -587,7 +655,7 @@
                 else:
                     columns_dict[key] = value
             return pd.DataFrame(columns_dict)
-
+
         if save_format not in [None, "csv", "parquet"]:
             raise ValueError("save_format must be either 'csv', 'parquet', or None")
         effective_format = save_format
@@ -598,7 +666,9 @@
         result_df = dict_to_dataframe(result_dict)
         if persist:
             if output_path is None:
-                raise ValueError("output_path must be provided when persisting transformed data.")
+                raise ValueError(
+                    "output_path must be provided when persisting transformed data."
+                )
             output_dir = Path(output_path)
             if output_dir.suffix:
                 output_dir = output_dir.parent
@@ -609,7 +679,11 @@
                 result_df.to_parquet(save_path, index=False)
             else:
                 result_df.to_csv(save_path, index=False)
-            logger.info(colorize(f"Transformed data saved to: {save_path.resolve()}", color="green"))
+            logger.info(
+                colorize(
+                    f"Transformed data saved to: {save_path.resolve()}", color="green"
+                )
+            )
         if return_dict:
             return result_dict
         assert result_df is not None, "DataFrame is None after transform"
@@ -627,7 +701,9 @@
         target_format = save_format or file_type
         if target_format not in ["csv", "parquet"]:
             raise ValueError("save_format must be either 'csv' or 'parquet'")
-        base_output_dir = Path(output_path) if output_path else default_output_dir(input_path)
+        base_output_dir = (
+            Path(output_path) if output_path else default_output_dir(input_path)
+        )
         if base_output_dir.suffix:
             base_output_dir = base_output_dir.parent
         output_root = base_output_dir / "transformed_data"
@@ -635,8 +711,12 @@
         saved_paths = []
         for file_path in tqdm.tqdm(file_paths, desc="Transforming files", unit="file"):
             df = read_table(file_path, file_type)
-            transformed_df = self.transform_in_memory(df, return_dict=False, persist=False, save_format=None, output_path=None)
-            assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
+            transformed_df = self.transform_in_memory(
+                df, return_dict=False, persist=False, save_format=None, output_path=None
+            )
+            assert isinstance(
+                transformed_df, pd.DataFrame
+            ), "Expected DataFrame when return_dict=False"
             source_path = Path(file_path)
             target_file = output_root / f"{source_path.stem}.{target_format}"
             if target_format == "csv":
@@ -644,17 +724,30 @@
             else:
                 transformed_df.to_parquet(target_file, index=False)
             saved_paths.append(str(target_file.resolve()))
-        logger.info(colorize(f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}", color="green",))
+        logger.info(
+            colorize(
+                f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}",
+                color="green",
+            )
+        )
         return saved_paths
 
     # fit is nothing but registering the statistics from data so that we can transform the data later
-    def fit(self, data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],chunk_size: int = 200000,):
+    def fit(
+        self,
+        data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
+        chunk_size: int = 200000,
+    ):
         logger = logging.getLogger()
         if isinstance(data, (str, os.PathLike)):
             path_str = str(data)
-            uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
+            uses_robust = any(
+                cfg.get("scaler") == "robust" for cfg in self.numeric_features.values()
+            )
             if uses_robust:
-                logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
+                logger.warning(
+                    "Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited."
+                )
                 data = self.load_dataframe_from_path(path_str)
             else:
                 return self.fit_from_path(path_str, chunk_size)
@@ -683,9 +776,9 @@
             self.process_target_fit(data[name], config)
         self.is_fitted = True
         return self
-
+
     def transform(
-        self,
+        self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
         save_format: Optional[Literal["csv", "parquet"]] = None,
@@ -695,12 +788,20 @@
             raise ValueError("DataProcessor must be fitted before transform")
         if isinstance(data, (str, os.PathLike)):
             if return_dict:
-                raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
+                raise ValueError(
+                    "Path transform writes files only; set return_dict=False when passing a path."
+                )
             return self.transform_path(str(data), output_path, save_format)
-        return self.transform_in_memory(data=data, return_dict=return_dict, persist=output_path is not None, save_format=save_format, output_path=output_path)
-
+        return self.transform_in_memory(
+            data=data,
+            return_dict=return_dict,
+            persist=output_path is not None,
+            save_format=save_format,
+            output_path=output_path,
+        )
+
     def fit_transform(
-        self,
+        self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
         save_format: Optional[Literal["csv", "parquet"]] = None,
@@ -726,7 +827,7 @@
             default_dir=Path(os.getcwd()),
             default_name="fitted_processor",
             suffix=".pkl",
-            add_timestamp=False
+            add_timestamp=False,
         )
         state = {
             "numeric_features": self.numeric_features,
@@ -741,117 +842,137 @@
         }
         with open(target_path, "wb") as f:
             pickle.dump(state, f)
-        logger.info(f"DataProcessor saved to: {target_path}, NextRec version: {self.version}")
-
+        logger.info(
+            f"DataProcessor saved to: {target_path}, NextRec version: {self.version}"
+        )
+
     @classmethod
-    def load(cls, load_path: str | Path) -> 'DataProcessor':
+    def load(cls, load_path: str | Path) -> "DataProcessor":
         logger = logging.getLogger()
         load_path = Path(load_path)
-        with open(load_path, 'rb') as f:
+        with open(load_path, "rb") as f:
             state = pickle.load(f)
         processor = cls()
-        processor.numeric_features = state.get('numeric_features', {})
-        processor.sparse_features = state.get('sparse_features', {})
-        processor.sequence_features = state.get('sequence_features', {})
-        processor.target_features = state.get('target_features', {})
-        processor.is_fitted = state.get('is_fitted', False)
-        processor.scalers = state.get('scalers', {})
-        processor.label_encoders = state.get('label_encoders', {})
-        processor.target_encoders = state.get('target_encoders', {})
+        processor.numeric_features = state.get("numeric_features", {})
+        processor.sparse_features = state.get("sparse_features", {})
+        processor.sequence_features = state.get("sequence_features", {})
+        processor.target_features = state.get("target_features", {})
+        processor.is_fitted = state.get("is_fitted", False)
+        processor.scalers = state.get("scalers", {})
+        processor.label_encoders = state.get("label_encoders", {})
+        processor.target_encoders = state.get("target_encoders", {})
         processor.version = state.get("processor_version", "unknown")
-        logger.info(f"DataProcessor loaded from {load_path}, NextRec version: {processor.version}")
+        logger.info(
+            f"DataProcessor loaded from {load_path}, NextRec version: {processor.version}"
+        )
         return processor
-
+
     def get_vocab_sizes(self) -> Dict[str, int]:
         vocab_sizes = {}
         for name, config in self.sparse_features.items():
-            vocab_sizes[name] = config.get('vocab_size', 0)
+            vocab_sizes[name] = config.get("vocab_size", 0)
         for name, config in self.sequence_features.items():
-            vocab_sizes[name] = config.get('vocab_size', 0)
+            vocab_sizes[name] = config.get("vocab_size", 0)
         return vocab_sizes
-
+
     def summary(self):
         """Print a summary of the DataProcessor configuration."""
         logger = logging.getLogger()
-
+
         logger.info(colorize("=" * 80, color="bright_blue", bold=True))
         logger.info(colorize("DataProcessor Summary", color="bright_blue", bold=True))
         logger.info(colorize("=" * 80, color="bright_blue", bold=True))
-
+
         logger.info("")
         logger.info(colorize("[1] Feature Configuration", color="cyan", bold=True))
        logger.info(colorize("-" * 80, color="cyan"))
-
+
         if self.numeric_features:
             logger.info(f"Dense Features ({len(self.numeric_features)}):")
-
+
             max_name_len = max(len(name) for name in self.numeric_features.keys())
             name_width = max(max_name_len, 10) + 2
-
-            logger.info(f" {'#':<4} {'Name':<{name_width}} {'Scaler':>15} {'Fill NA':>10}")
+
+            logger.info(
+                f" {'#':<4} {'Name':<{name_width}} {'Scaler':>15} {'Fill NA':>10}"
+            )
             logger.info(f" {'-'*4} {'-'*name_width} {'-'*15} {'-'*10}")
             for i, (name, config) in enumerate(self.numeric_features.items(), 1):
-                scaler = config['scaler']
-                fill_na = config.get('fill_na_value', config.get('fill_na', 'N/A'))
-                logger.info(f" {i:<4} {name:<{name_width}} {str(scaler):>15} {str(fill_na):>10}")
-
+                scaler = config["scaler"]
+                fill_na = config.get("fill_na_value", config.get("fill_na", "N/A"))
+                logger.info(
+                    f" {i:<4} {name:<{name_width}} {str(scaler):>15} {str(fill_na):>10}"
+                )
+
         if self.sparse_features:
             logger.info(f"Sparse Features ({len(self.sparse_features)}):")
-
+
             max_name_len = max(len(name) for name in self.sparse_features.keys())
             name_width = max(max_name_len, 10) + 2
-
-            logger.info(f" {'#':<4} {'Name':<{name_width}} {'Method':>12} {'Vocab Size':>12} {'Hash Size':>12}")
+
+            logger.info(
+                f" {'#':<4} {'Name':<{name_width}} {'Method':>12} {'Vocab Size':>12} {'Hash Size':>12}"
+            )
             logger.info(f" {'-'*4} {'-'*name_width} {'-'*12} {'-'*12} {'-'*12}")
             for i, (name, config) in enumerate(self.sparse_features.items(), 1):
-                method = config['encode_method']
-                vocab_size = config.get('vocab_size', 'N/A')
-                hash_size = config.get('hash_size', 'N/A')
-                logger.info(f" {i:<4} {name:<{name_width}} {str(method):>12} {str(vocab_size):>12} {str(hash_size):>12}")
-
+                method = config["encode_method"]
+                vocab_size = config.get("vocab_size", "N/A")
+                hash_size = config.get("hash_size", "N/A")
+                logger.info(
+                    f" {i:<4} {name:<{name_width}} {str(method):>12} {str(vocab_size):>12} {str(hash_size):>12}"
+                )
+
         if self.sequence_features:
             logger.info(f"Sequence Features ({len(self.sequence_features)}):")
-
+
             max_name_len = max(len(name) for name in self.sequence_features.keys())
             name_width = max(max_name_len, 10) + 2
-
-            logger.info(f" {'#':<4} {'Name':<{name_width}} {'Method':>12} {'Vocab Size':>12} {'Hash Size':>12} {'Max Len':>10}")
-            logger.info(f" {'-'*4} {'-'*name_width} {'-'*12} {'-'*12} {'-'*12} {'-'*10}")
+
+            logger.info(
+                f" {'#':<4} {'Name':<{name_width}} {'Method':>12} {'Vocab Size':>12} {'Hash Size':>12} {'Max Len':>10}"
+            )
+            logger.info(
+                f" {'-'*4} {'-'*name_width} {'-'*12} {'-'*12} {'-'*12} {'-'*10}"
+            )
             for i, (name, config) in enumerate(self.sequence_features.items(), 1):
-                method = config['encode_method']
-                vocab_size = config.get('vocab_size', 'N/A')
-                hash_size = config.get('hash_size', 'N/A')
-                max_len = config.get('max_len', 'N/A')
-                logger.info(f" {i:<4} {name:<{name_width}} {str(method):>12} {str(vocab_size):>12} {str(hash_size):>12} {str(max_len):>10}")
-
+                method = config["encode_method"]
+                vocab_size = config.get("vocab_size", "N/A")
+                hash_size = config.get("hash_size", "N/A")
+                max_len = config.get("max_len", "N/A")
+                logger.info(
+                    f" {i:<4} {name:<{name_width}} {str(method):>12} {str(vocab_size):>12} {str(hash_size):>12} {str(max_len):>10}"
+                )
+
         logger.info("")
         logger.info(colorize("[2] Target Configuration", color="cyan", bold=True))
         logger.info(colorize("-" * 80, color="cyan"))
-
+
         if self.target_features:
             logger.info(f"Target Features ({len(self.target_features)}):")
-
+
             max_name_len = max(len(name) for name in self.target_features.keys())
             name_width = max(max_name_len, 10) + 2
-
+
             logger.info(f" {'#':<4} {'Name':<{name_width}} {'Type':>15}")
             logger.info(f" {'-'*4} {'-'*name_width} {'-'*15}")
             for i, (name, config) in enumerate(self.target_features.items(), 1):
-                target_type = config['target_type']
+                target_type = config["target_type"]
                 logger.info(f" {i:<4} {name:<{name_width}} {str(target_type):>15}")
         else:
             logger.info("No target features configured")
-
+
         logger.info("")
         logger.info(colorize("[3] Processor Status", color="cyan", bold=True))
         logger.info(colorize("-" * 80, color="cyan"))
         logger.info(f"Fitted: {self.is_fitted}")
-        logger.info(f"Total Features: {len(self.numeric_features) + len(self.sparse_features) + len(self.sequence_features)}")
+        logger.info(
+            f"Total Features: {len(self.numeric_features) + len(self.sparse_features) + len(self.sequence_features)}"
+        )
         logger.info(f" Dense Features: {len(self.numeric_features)}")
         logger.info(f" Sparse Features: {len(self.sparse_features)}")
         logger.info(f" Sequence Features: {len(self.sequence_features)}")
         logger.info(f"Target Features: {len(self.target_features)}")
-
+
         logger.info("")
         logger.info("")
         logger.info(colorize("=" * 80, color="bright_blue", bold=True))
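For orientation, the diff above covers nextrec/data/preprocessor.py, whose DataProcessor class is the largest data-side change in 0.4.2 (mostly Black-style reformatting plus the streaming fit_from_path/transform_path paths). Below is a minimal usage sketch of that API as it reads after the diff. The DataFrame, its column names, and the pickle path are hypothetical illustrations; only the class, method names, and the parameters visible in the hunks above are taken from the package, and the exact save() signature is assumed since it is not fully shown.

# Hypothetical usage sketch of nextrec.data.preprocessor.DataProcessor (0.4.2 API as shown above).
# The DataFrame, column names, and save path are illustrative only.
import pandas as pd
from nextrec.data.preprocessor import DataProcessor

df = pd.DataFrame(
    {
        "price": [9.9, 19.5, None, 4.2],             # numeric; missing value filled with the column mean
        "user_id": ["u1", "u2", "u1", "u3"],         # sparse; label-encoded
        "hist_items": ["i1,i2", "i3", "", "i2,i4"],  # sequence; comma-separated, hashed and padded
        "click": [1, 0, 0, 1],                       # binary target
    }
)

processor = DataProcessor()
processor.add_numeric_feature("price", scaler="standard")
processor.add_sparse_feature("user_id", encode_method="label")
processor.add_sequence_feature("hist_items", encode_method="hash", hash_size=1000, max_len=5)
processor.add_target("click", target_type="binary")

features = processor.fit_transform(df, return_dict=True)  # dict of numpy arrays, one per feature/target
print(processor.get_vocab_sizes())                        # e.g. {'user_id': 3, 'hist_items': 1000}

processor.save("fitted_processor.pkl")                    # path argument assumed; defaults come from resolve_save_path
reloaded = DataProcessor.load("fitted_processor.pkl")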