nextrec 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +5 -1
- nextrec/basic/layers.py +3 -7
- nextrec/basic/model.py +495 -664
- nextrec/data/data_utils.py +44 -12
- nextrec/data/dataloader.py +84 -285
- nextrec/data/preprocessor.py +93 -214
- nextrec/loss/__init__.py +0 -1
- nextrec/loss/loss_utils.py +51 -120
- nextrec/models/multi_task/esmm.py +1 -1
- nextrec/models/ranking/masknet.py +1 -1
- nextrec/utils/__init__.py +4 -1
- nextrec/utils/common.py +16 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/METADATA +2 -2
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/RECORD +17 -16
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/WHEEL +0 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/licenses/LICENSE +0 -0
nextrec/data/preprocessor.py
CHANGED
@@ -30,8 +30,10 @@ from nextrec.data.data_utils import (
     load_dataframes,
     default_output_dir,
 )
-from nextrec.basic.session import
+from nextrec.basic.session import resolve_save_path
 from nextrec.basic.features import FeatureSpecMixin
+from nextrec.__version__ import __version__
+

 class DataProcessor(FeatureSpecMixin):
     """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
@@ -54,28 +56,21 @@ class DataProcessor(FeatureSpecMixin):
     >>> # Get vocabulary sizes for embedding layers
     >>> vocab_sizes = processor.get_vocab_sizes()
     """
-    def __init__(self
+    def __init__(self):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}
         self.sequence_features: Dict[str, Dict[str, Any]] = {}
         self.target_features: Dict[str, Dict[str, Any]] = {}
-        self.
-
-
+        self.version = __version__
+
         self.is_fitted = False
         self._transform_summary_printed = False  # Track if summary has been printed during transform

         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
-        self.
-
-        # Initialize logger if not already initialized
-        self._logger_initialized = False
-        if not logging.getLogger().hasHandlers():
-            setup_logger(session_id=self.session_id)
-            self._logger_initialized = True
-
+        self._set_target_id_config([], [])
+
     def add_numeric_feature(
         self,
         name: str,
@@ -96,7 +91,6 @@ class DataProcessor(FeatureSpecMixin):
     ):
         if encode_method == 'hash' and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
-
         self.sparse_features[name] = {
             'encode_method': encode_method,
             'hash_size': hash_size,
@@ -113,10 +107,8 @@ class DataProcessor(FeatureSpecMixin):
         truncate: Literal['pre', 'post'] = 'pre',  # pre: keep last max_len items, post: keep first max_len items
         separator: str = ','
     ):
-
         if encode_method == 'hash' and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
-
         self.sequence_features[name] = {
             'encode_method': encode_method,
             'hash_size': hash_size,
@@ -136,23 +128,20 @@ class DataProcessor(FeatureSpecMixin):
             'target_type': target_type,
             'label_map': label_map
         }
-        self.
+        self._set_target_id_config(list(self.target_features.keys()), [])

     def _hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

     def _process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na = config['fill_na']
-
         if data.isna().any():
             if fill_na is None:
                 # Default use mean value to fill missing values for numeric features
                 fill_na = data.mean()
             config['fill_na_value'] = fill_na
-
         if scaler_type == 'standard':
             scaler = StandardScaler()
         elif scaler_type == 'minmax':
@@ -167,27 +156,19 @@ class DataProcessor(FeatureSpecMixin):
             scaler = None
         else:
             raise ValueError(f"Unknown scaler type: {scaler_type}")
-
         if scaler is not None and scaler_type != 'log':
             filled_data = data.fillna(config.get('fill_na_value', 0))
             values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
             scaler.fit(values)
             self.scalers[name] = scaler

-    def _process_numeric_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
-
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na_value = config.get('fill_na_value', 0)
-
         filled_data = data.fillna(fill_na_value)
         values = np.array(filled_data.values, dtype=np.float64)
-
         if scaler_type == 'log':
             result = np.log1p(np.maximum(values, 0))
         elif scaler_type == 'none':
@@ -199,17 +180,13 @@ class DataProcessor(FeatureSpecMixin):
             result = values
         else:
             result = scaler.transform(values.reshape(-1, 1)).ravel()
-
         return result

     def _process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']  # <UNK>
-
         filled_data = data.fillna(fill_na).astype(str)
-
         if encode_method == 'label':
             le = LabelEncoder()
             le.fit(filled_data)
@@ -218,49 +195,32 @@ class DataProcessor(FeatureSpecMixin):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sparse_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
-        """Fast path sparse feature transform using cached dict mapping or hashing."""
+    def _process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']
-
         sparse_series = pd.Series(data, name=name).fillna(fill_na).astype(str)
-
         if encode_method == 'label':
             le = self.label_encoders.get(name)
             if le is None:
                 raise ValueError(f"LabelEncoder for {name} not fitted")
-
             class_to_idx = config.get('_class_to_idx')
             if class_to_idx is None:
                 class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
                 config['_class_to_idx'] = class_to_idx
-
             encoded = sparse_series.map(class_to_idx)
             encoded = encoded.fillna(0).astype(np.int64)
             return encoded.to_numpy()
-
         if encode_method == 'hash':
             hash_size = config['hash_size']
             hash_fn = self._hash_string
-            return np.fromiter(
-                (hash_fn(v, hash_size) for v in sparse_series.to_numpy()),
-                dtype=np.int64,
-                count=sparse_series.size,
-            )
-
+            return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
         return np.array([], dtype=np.int64)

     def _process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         encode_method = config['encode_method']
         separator = config['separator']
-
         if encode_method == 'label':
             all_tokens = set()
             for seq in data:
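Note: both the sparse and sequence 'hash' paths rely on `_hash_string`, which buckets raw strings with MD5 so the resulting ids are stable across runs and platforms. A minimal standalone sketch of that hashing trick (names and sizes here are illustrative, not taken from the package):

    import hashlib
    import numpy as np

    def hash_to_bucket(value, hash_size: int) -> int:
        # Same idea as DataProcessor._hash_string: MD5 digest modulo a fixed vocab size
        return int(hashlib.md5(str(value).encode()).hexdigest(), 16) % hash_size

    values = ["user_1", "user_2", "user_1", None]
    ids = np.fromiter((hash_to_bucket(v, 1000) for v in values), dtype=np.int64, count=len(values))
    # Unlike Python's built-in hash(), the MD5-based bucket ids do not change between processes.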
@@ -280,12 +240,9 @@ class DataProcessor(FeatureSpecMixin):
                     tokens = [str(t) for t in seq.tolist()]
                 else:
                     continue
-
                 all_tokens.update(tokens)
-
             if len(all_tokens) == 0:
                 all_tokens.add('<PAD>')
-
             le = LabelEncoder()
             le.fit(list(all_tokens))
             self.label_encoders[name] = le
@@ -293,11 +250,7 @@ class DataProcessor(FeatureSpecMixin):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sequence_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
         encode_method = config['encode_method']
@@ -305,15 +258,12 @@ class DataProcessor(FeatureSpecMixin):
         pad_value = config['pad_value']
         truncate = config['truncate']
         separator = config['separator']
-
         arr = np.asarray(data, dtype=object)
         n = arr.shape[0]
         output = np.full((n, max_len), pad_value, dtype=np.int64)
-
         # Shared helpers cached locally for speed and cross-platform consistency
         split_fn = str.split
         is_nan = np.isnan
-
         if encode_method == 'label':
             le = self.label_encoders.get(name)
             if le is None:
@@ -324,10 +274,8 @@ class DataProcessor(FeatureSpecMixin):
                 config['_class_to_idx'] = class_to_idx
             else:
                 class_to_idx = None  # type: ignore
-
         hash_fn = self._hash_string
         hash_size = config.get('hash_size')
-
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings
             tokens = []
@@ -342,14 +290,12 @@ class DataProcessor(FeatureSpecMixin):
                 tokens = [str(t) for t in seq]
             else:
                 tokens = []
-
             if encode_method == 'label':
                 encoded = [
                     class_to_idx.get(token.strip(), 0)  # type: ignore[union-attr]
                     for token in tokens
                     if token is not None and token != ''
                 ]
-
             elif encode_method == 'hash':
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
@@ -360,27 +306,21 @@ class DataProcessor(FeatureSpecMixin):
                 ]
             else:
                 encoded = []
-
             if not encoded:
                 continue
-
             if len(encoded) > max_len:
                 encoded = encoded[-max_len:] if truncate == 'pre' else encoded[:max_len]
-
             output[i, : len(encoded)] = encoded
-
         return output

     def _process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         target_type = config['target_type']
-        label_map = config
-
+        label_map = config.get('label_map')
         if target_type in ['binary', 'multiclass']:
             if label_map is None:
                 unique_values = data.dropna().unique()
                 sorted_values = sorted(unique_values)
-
                 try:
                     int_values = [int(v) for v in sorted_values]
                     if int_values == list(range(len(int_values))):
@@ -389,29 +329,20 @@ class DataProcessor(FeatureSpecMixin):
                         label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
                 except (ValueError, TypeError):
                     label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
-
             config['label_map'] = label_map
-
             self.target_encoders[name] = label_map

-    def _process_target_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
-
         name = str(data.name)
-        target_type = config
-
+        target_type = config.get('target_type')
         if target_type == 'regression':
             values = np.array(data.values, dtype=np.float32)
             return values
         else:
             label_map = self.target_encoders.get(name)
             if label_map is None:
-                raise ValueError(f"Target encoder for {name} not fitted")
-
+                raise ValueError(f"Target encoder for {name} not fitted")
             result = []
             for val in data:
                 str_val = str(val)
@@ -420,7 +351,6 @@ class DataProcessor(FeatureSpecMixin):
                 else:
                     logger.warning(f"Unknown target value: {val}, mapping to 0")
                     result.append(0)
-
             return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)

     def _load_dataframe_from_path(self, path: str) -> pd.DataFrame:
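Note: for binary/multiclass targets the fitted mapping lives in `target_encoders`, and unseen labels fall back to 0 with a warning. A tiny standalone illustration of that mapping logic (toy labels only, no nextrec imports):

    import numpy as np

    labels = ["cat", "dog", "cat", "bird"]
    sorted_values = sorted(set(str(v) for v in labels))
    label_map = {v: i for i, v in enumerate(sorted_values)}     # {'bird': 0, 'cat': 1, 'dog': 2}

    def encode(values, label_map):
        # unknown values map to 0, matching the warning branch in _process_target_transform
        return np.array([label_map.get(str(v), 0) for v in values], dtype=np.int64)

    print(encode(["dog", "cat", "fish"], label_map))            # [2 1 0]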
@@ -458,13 +388,10 @@ class DataProcessor(FeatureSpecMixin):
                 "max": -np.inf,
                 "max_abs": 0.0,
             }
-
         sparse_vocab: Dict[str, set[str]] = {name: set() for name in self.sparse_features.keys()}
         seq_vocab: Dict[str, set[str]] = {name: set() for name in self.sequence_features.keys()}
         target_values: Dict[str, set[Any]] = {name: set() for name in self.target_features.keys()}
-
         missing_features = set()
-
         for file_path in file_paths:
             for chunk in iter_file_chunks(file_path, file_type, chunk_size):
                 # numeric features
@@ -514,25 +441,19 @@ class DataProcessor(FeatureSpecMixin):
                     continue
                 vals = chunk[name].dropna().tolist()
                 target_values[name].update(vals)
-
         if missing_features:
-            logger.warning(
-                f"The following configured features were not found in provided files: {sorted(missing_features)}"
-            )
-
+            logger.warning(f"The following configured features were not found in provided files: {sorted(missing_features)}")
         # finalize numeric scalers
         for name, config in self.numeric_features.items():
             acc = numeric_acc[name]
             if acc["count"] == 0:
                 logger.warning(f"Numeric feature {name} has no valid values in provided files")
                 continue
-
             mean_val = acc["sum"] / acc["count"]
             if config["fill_na"] is not None:
                 config["fill_na_value"] = config["fill_na"]
             else:
                 config["fill_na_value"] = mean_val
-
             scaler_type = config["scaler"]
             if scaler_type == "standard":
                 var = max(acc["sumsq"] / acc["count"] - mean_val * mean_val, 0.0)
@@ -550,6 +471,11 @@ class DataProcessor(FeatureSpecMixin):
                 scaler.data_max_ = np.array([data_max], dtype=np.float64)
                 scaler.data_range_ = scaler.data_max_ - scaler.data_min_
                 scaler.data_range_[scaler.data_range_ == 0] = 1.0
+                # Manually set scale_/min_ for streaming fit to mirror sklearn's internal fit logic
+                feature_min, feature_max = scaler.feature_range
+                scale = (feature_max - feature_min) / scaler.data_range_
+                scaler.scale_ = scale
+                scaler.min_ = feature_min - scaler.data_min_ * scale
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
             elif scaler_type == "maxabs":
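Note: the added lines finalize a MinMaxScaler from streamed min/max statistics instead of calling `fit` on the whole column. A self-contained check of that arithmetic against sklearn's public attributes (the feature values and counts below are made up):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # Pretend these were accumulated while iterating file chunks
    data_min, data_max, count = 2.0, 10.0, 1_000_000

    scaler = MinMaxScaler()
    scaler.data_min_ = np.array([data_min], dtype=np.float64)
    scaler.data_max_ = np.array([data_max], dtype=np.float64)
    scaler.data_range_ = scaler.data_max_ - scaler.data_min_
    scaler.data_range_[scaler.data_range_ == 0] = 1.0

    # Mirrors sklearn's fit; transform then computes X * scale_ + min_
    feature_min, feature_max = scaler.feature_range
    scaler.scale_ = (feature_max - feature_min) / scaler.data_range_
    scaler.min_ = feature_min - scaler.data_min_ * scaler.scale_
    scaler.n_samples_seen_ = np.array([count], dtype=np.int64)

    print(scaler.transform(np.array([[2.0], [6.0], [10.0]])))  # [[0.], [0.5], [1.]]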
@@ -626,9 +552,9 @@ class DataProcessor(FeatureSpecMixin):
         return_dict: bool,
         persist: bool,
         save_format: Optional[Literal["csv", "parquet"]],
+        output_path: Optional[str],
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
-        logger = logging.getLogger()
-
+        logger = logging.getLogger()
         # Convert input to dict format for unified processing
         if isinstance(data, pd.DataFrame):
             data_dict = {col: data[col] for col in data.columns}
@@ -688,173 +614,133 @@ class DataProcessor(FeatureSpecMixin):
             columns_dict = {}
             for key, value in result.items():
                 if key in self.sequence_features:
-
+                    # Use tolist to coerce numpy scalars to native Python ints for stable CSV rendering
+                    columns_dict[key] = [np.asarray(seq).tolist() for seq in value]
                 else:
                     columns_dict[key] = value
             return pd.DataFrame(columns_dict)
-
-
-
-
-
+
+        if save_format not in [None, "csv", "parquet"]:
+            raise ValueError("save_format must be either 'csv', 'parquet', or None")
+        effective_format = save_format
+        if persist:
+            effective_format = save_format or "parquet"
         result_df = None
-        if (not return_dict) or
+        if (not return_dict) or persist:
             result_df = _dict_to_dataframe(result_dict)
-
-
-
-
-
-
-
-
-
-
-
-            if save_format == "parquet":
+        if persist:
+            if output_path is None:
+                raise ValueError("output_path must be provided when persisting transformed data.")
+            output_dir = Path(output_path)
+            if output_dir.suffix:
+                output_dir = output_dir.parent
+            output_dir.mkdir(parents=True, exist_ok=True)
+            save_path = output_dir / f"transformed_data.{effective_format}"
+            assert result_df is not None, "DataFrame conversion failed"
+            if effective_format == "parquet":
                 result_df.to_parquet(save_path, index=False)
             else:
                 result_df.to_csv(save_path, index=False)
-
-            logger.info(colorize(
-                f"Transformed data saved to: {save_path}",
-                color="green"
-            ))
-
+            logger.info(colorize(f"Transformed data saved to: {save_path.resolve()}", color="green"))
         if return_dict:
             return result_dict
+        assert result_df is not None, "DataFrame is None after transform"
         return result_df

-    def _transform_path(
+    def _transform_path(
+        self,
+        input_path: str,
+        output_path: Optional[str],
+        save_format: Optional[Literal["csv", "parquet"]],
+    ) -> list[str]:
         """Transform data from files under a path and save them to a new location."""
         logger = logging.getLogger()
-
-
-
-
-
-
-
-
-        if not output_path_obj.is_absolute():
-            output_path_obj = self.session_dir / output_path_obj
-        if output_path_obj.suffix.lower() in {".csv", ".parquet"}:
-            if len(file_paths) != 1:
-                raise ValueError("output_path points to a file but multiple input files were provided.")
-            target_file_override = output_path_obj
-            output_root = output_path_obj.parent
-        else:
-            output_root = output_path_obj
-
+        file_paths, file_type = resolve_file_paths(input_path)
+        target_format = save_format or file_type
+        if target_format not in ["csv", "parquet"]:
+            raise ValueError("save_format must be either 'csv' or 'parquet'")
+        base_output_dir = Path(output_path) if output_path else default_output_dir(input_path)
+        if base_output_dir.suffix:
+            base_output_dir = base_output_dir.parent
+        output_root = base_output_dir / "transformed_data"
         output_root.mkdir(parents=True, exist_ok=True)
-
-        saved_paths: list[str] = []
+        saved_paths = []
         for file_path in file_paths:
             df = read_table(file_path, file_type)
-
             transformed_df = self._transform_in_memory(
                 df,
                 return_dict=False,
                 persist=False,
                 save_format=None,
+                output_path=None,
             )
             assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
-
             source_path = Path(file_path)
-            target_file =
-
-            if target_file_override is not None
-            else output_root / f"{source_path.stem}_preprocessed{source_path.suffix}"
-            )
-
-            if file_type == "csv":
+            target_file = output_root / f"{source_path.stem}.{target_format}"
+            if target_format == "csv":
                 transformed_df.to_csv(target_file, index=False)
             else:
                 transformed_df.to_parquet(target_file, index=False)
-
             saved_paths.append(str(target_file.resolve()))
-
-        logger.info(colorize(
-            f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}",
-            color="green",
-        ))
+        logger.info(colorize(f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}", color="green",))
         return saved_paths

     # fit is nothing but registering the statistics from data so that we can transform the data later
-    def fit(
-        self,
-        data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
-        chunk_size: int = 200000,
-    ):
+    def fit(self, data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],chunk_size: int = 200000,):
         logger = logging.getLogger()
-
         if isinstance(data, (str, os.PathLike)):
             path_str = str(data)
             uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
             if uses_robust:
-                logger.warning(
-                    "Robust scaler requires full data; loading all files into memory. "
-                    "Consider smaller chunk_size or different scaler if memory is limited."
-                )
+                logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
                 data = self._load_dataframe_from_path(path_str)
             else:
                 return self._fit_from_path(path_str, chunk_size)
         if isinstance(data, dict):
             data = pd.DataFrame(data)
-
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
-
         for name, config in self.numeric_features.items():
             if name not in data.columns:
                 logger.warning(f"Numeric feature {name} not found in data")
                 continue
             self._process_numeric_feature_fit(data[name], config)
-
         for name, config in self.sparse_features.items():
             if name not in data.columns:
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
             self._process_sparse_feature_fit(data[name], config)
-
         for name, config in self.sequence_features.items():
             if name not in data.columns:
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
             self._process_sequence_feature_fit(data[name], config)
-
         for name, config in self.target_features.items():
             if name not in data.columns:
                 logger.warning(f"Target {name} not found in data")
                 continue
             self._process_target_fit(data[name], config)
-
         self.is_fitted = True
-        logger.info(colorize("DataProcessor fitted successfully", color="green", bold=True))
         return self

     def transform(
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        persist: bool = False,
         save_format: Optional[Literal["csv", "parquet"]] = None,
         output_path: Optional[str] = None,
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray], list[str]]:
-        logger = logging.getLogger()
-
         if not self.is_fitted:
             raise ValueError("DataProcessor must be fitted before transform")
-
         if isinstance(data, (str, os.PathLike)):
-            if return_dict
-            raise ValueError("Path transform writes files only;
-            return self._transform_path(str(data), output_path)
-
+            if return_dict:
+                raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
+            return self._transform_path(str(data), output_path, save_format)
         return self._transform_in_memory(
             data=data,
             return_dict=return_dict,
-            persist=
+            persist=output_path is not None,
             save_format=save_format,
+            output_path=output_path,
         )

     def fit_transform(
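Note: as the hunk above shows, `transform` no longer takes a `persist` flag; writing to disk is keyed off `output_path`, and path inputs must be called with `return_dict=False`. A usage sketch of the 0.2.6 call patterns (the fitted `processor`, `train_df`, and the paths are placeholders, not from the package):

    # assumes: processor is a fitted DataProcessor and train_df is a pandas DataFrame
    arrays = processor.transform(train_df)                      # in-memory, dict of numpy arrays

    df = processor.transform(train_df, return_dict=False,
                             output_path="out/")                # also writes out/transformed_data.parquet

    saved_files = processor.transform("data/train/", return_dict=False,
                                      output_path="out/", save_format="parquet")
    # path input: each source file is transformed, written under out/transformed_data/, and the paths returned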
@@ -872,21 +758,20 @@ class DataProcessor(FeatureSpecMixin):
             save_format=save_format,
             output_path=output_path,
         )
-
-    def save(self, save_path: str):
-        logger = logging.getLogger()

+    def save(self, save_path: str | Path):
+        logger = logging.getLogger()
+        assert isinstance(save_path, (str, Path)), "save_path must be a string or Path"
+        save_path = Path(save_path)
         if not self.is_fitted:
             logger.warning("Saving unfitted DataProcessor")
-
         target_path = resolve_save_path(
             path=save_path,
-            default_dir=
-            default_name="
+            default_dir=Path(os.getcwd()),
+            default_name="fitted_processor",
             suffix=".pkl",
+            add_timestamp=False
         )
-
-        # Prepare state dict
         state = {
             "numeric_features": self.numeric_features,
             "sparse_features": self.sparse_features,
@@ -896,43 +781,37 @@ class DataProcessor(FeatureSpecMixin):
             "scalers": self.scalers,
             "label_encoders": self.label_encoders,
             "target_encoders": self.target_encoders,
+            "processor_version": __version__,
         }
-
-        # Save with pickle
         with open(target_path, "wb") as f:
             pickle.dump(state, f)
-
-        logger.info(colorize(f"DataProcessor saved to: {target_path}", color="green"))
+        logger.info(f"DataProcessor saved to: {target_path}, NextRec version: {self.version}")

     @classmethod
-    def load(cls, load_path: str) -> 'DataProcessor':
+    def load(cls, load_path: str | Path) -> 'DataProcessor':
         logger = logging.getLogger()
-
+        load_path = Path(load_path)
         with open(load_path, 'rb') as f:
             state = pickle.load(f)
-
         processor = cls()
-        processor.numeric_features = state
-        processor.sparse_features = state
-        processor.sequence_features = state
-        processor.target_features = state
-        processor.is_fitted = state
-        processor.scalers = state
-        processor.label_encoders = state
-        processor.target_encoders = state
-
-        logger.info(f"DataProcessor loaded from {load_path}")
+        processor.numeric_features = state.get('numeric_features', {})
+        processor.sparse_features = state.get('sparse_features', {})
+        processor.sequence_features = state.get('sequence_features', {})
+        processor.target_features = state.get('target_features', {})
+        processor.is_fitted = state.get('is_fitted', False)
+        processor.scalers = state.get('scalers', {})
+        processor.label_encoders = state.get('label_encoders', {})
+        processor.target_encoders = state.get('target_encoders', {})
+        processor.version = state.get("processor_version", "unknown")
+        logger.info(f"DataProcessor loaded from {load_path}, NextRec version: {processor.version}")
         return processor

     def get_vocab_sizes(self) -> Dict[str, int]:
         vocab_sizes = {}
-
         for name, config in self.sparse_features.items():
             vocab_sizes[name] = config.get('vocab_size', 0)
-
         for name, config in self.sequence_features.items():
             vocab_sizes[name] = config.get('vocab_size', 0)
-
         return vocab_sizes

     def summary(self):
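Note: `save` now stamps the pickled state with a "processor_version" key and `load` restores each field with `state.get(...)`, so older pickles load with version "unknown". A round-trip sketch (the import path is inferred from the diffed file location; the file name is arbitrary):

    from nextrec.data.preprocessor import DataProcessor

    processor = DataProcessor()
    # ... register features and call processor.fit(...) here ...
    processor.save("fitted_processor.pkl")          # resolve_save_path fills in directory/name/suffix defaults

    restored = DataProcessor.load("fitted_processor.pkl")
    print(restored.version)                         # NextRec version recorded at save time
    print(restored.get_vocab_sizes())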