nextrec 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
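Taken together, the hunks below remove the session-based workflow from DataProcessor: __init__ no longer accepts session_id, the create_session import and logger bootstrap are gone, transform drops the persist flag in favour of output_path, and save/load now record the package version and accept plain str | Path locations. A minimal usage sketch of the 0.2.6 API as it appears in this diff follows; the module path, column names, and add_numeric_feature defaults are illustrative assumptions, not taken from the package documentation.

import pandas as pd
from nextrec.data.preprocessor import DataProcessor  # module path assumed for illustration

df = pd.DataFrame({"price": [1.0, 2.5, None], "label": [0, 1, 0]})

processor = DataProcessor()                # 0.2.6: no session_id argument
processor.add_numeric_feature("price")     # further configuration arguments omitted in this sketch
processor.fit(df)

features = processor.transform(df)                                      # dict of np.ndarray (return_dict=True)
frame = processor.transform(df, return_dict=False, output_path="out")   # also writes out/transformed_data.parquet
processor.save("fitted_processor.pkl")     # plain path; no session directory involved
restored = DataProcessor.load("fitted_processor.pkl")
vocab_sizes = restored.get_vocab_sizes()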
@@ -30,8 +30,10 @@ from nextrec.data.data_utils import (
  load_dataframes,
  default_output_dir,
  )
- from nextrec.basic.session import create_session, resolve_save_path
+ from nextrec.basic.session import resolve_save_path
  from nextrec.basic.features import FeatureSpecMixin
+ from nextrec.__version__ import __version__
+
 
  class DataProcessor(FeatureSpecMixin):
  """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
@@ -54,28 +56,21 @@ class DataProcessor(FeatureSpecMixin):
  >>> # Get vocabulary sizes for embedding layers
  >>> vocab_sizes = processor.get_vocab_sizes()
  """
- def __init__(self, session_id: str | None = None ):
+ def __init__(self):
  self.numeric_features: Dict[str, Dict[str, Any]] = {}
  self.sparse_features: Dict[str, Dict[str, Any]] = {}
  self.sequence_features: Dict[str, Dict[str, Any]] = {}
  self.target_features: Dict[str, Dict[str, Any]] = {}
- self.session_id = session_id
- self.session = create_session(session_id)
-
+ self.version = __version__
+
  self.is_fitted = False
  self._transform_summary_printed = False # Track if summary has been printed during transform
 
  self.scalers: Dict[str, Any] = {}
  self.label_encoders: Dict[str, LabelEncoder] = {}
  self.target_encoders: Dict[str, Dict[str, int]] = {}
- self._set_target_config([], [])
-
- # Initialize logger if not already initialized
- self._logger_initialized = False
- if not logging.getLogger().hasHandlers():
- setup_logger(session_id=self.session_id)
- self._logger_initialized = True
-
+ self._set_target_id_config([], [])
+
  def add_numeric_feature(
  self,
  name: str,
@@ -96,7 +91,6 @@ class DataProcessor(FeatureSpecMixin):
  ):
  if encode_method == 'hash' and hash_size is None:
  raise ValueError("hash_size must be specified when encode_method='hash'")
-
  self.sparse_features[name] = {
  'encode_method': encode_method,
  'hash_size': hash_size,
@@ -113,10 +107,8 @@ class DataProcessor(FeatureSpecMixin):
  truncate: Literal['pre', 'post'] = 'pre', # pre: keep last max_len items, post: keep first max_len items
  separator: str = ','
  ):
-
  if encode_method == 'hash' and hash_size is None:
  raise ValueError("hash_size must be specified when encode_method='hash'")
-
  self.sequence_features[name] = {
  'encode_method': encode_method,
  'hash_size': hash_size,
@@ -136,23 +128,20 @@ class DataProcessor(FeatureSpecMixin):
  'target_type': target_type,
  'label_map': label_map
  }
- self._set_target_config(list(self.target_features.keys()), [])
+ self._set_target_id_config(list(self.target_features.keys()), [])
 
  def _hash_string(self, s: str, hash_size: int) -> int:
  return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size
 
  def _process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
  name = str(data.name)
  scaler_type = config['scaler']
  fill_na = config['fill_na']
-
  if data.isna().any():
  if fill_na is None:
  # Default use mean value to fill missing values for numeric features
  fill_na = data.mean()
  config['fill_na_value'] = fill_na
-
  if scaler_type == 'standard':
  scaler = StandardScaler()
  elif scaler_type == 'minmax':
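For reference, the unchanged _hash_string method in this hunk reduces an MD5 digest modulo the configured vocabulary size, which keeps hash encoding deterministic across processes and platforms (unlike Python's built-in hash(), which is salted per process). A standalone sketch of the same idea:

import hashlib

def hash_string(s: str, hash_size: int) -> int:
    # Same scheme as DataProcessor._hash_string: stable MD5 digest reduced modulo hash_size.
    return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

print(hash_string("user_42", 1000))  # identical value on every run and machine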
@@ -167,27 +156,19 @@ class DataProcessor(FeatureSpecMixin):
  scaler = None
  else:
  raise ValueError(f"Unknown scaler type: {scaler_type}")
-
  if scaler is not None and scaler_type != 'log':
  filled_data = data.fillna(config.get('fill_na_value', 0))
  values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
  scaler.fit(values)
  self.scalers[name] = scaler
 
- def _process_numeric_feature_transform(
- self,
- data: pd.Series,
- config: Dict[str, Any]
- ) -> np.ndarray:
+ def _process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
  logger = logging.getLogger()
-
  name = str(data.name)
  scaler_type = config['scaler']
  fill_na_value = config.get('fill_na_value', 0)
-
  filled_data = data.fillna(fill_na_value)
  values = np.array(filled_data.values, dtype=np.float64)
-
  if scaler_type == 'log':
  result = np.log1p(np.maximum(values, 0))
  elif scaler_type == 'none':
@@ -199,17 +180,13 @@ class DataProcessor(FeatureSpecMixin):
  result = values
  else:
  result = scaler.transform(values.reshape(-1, 1)).ravel()
-
  return result
 
  def _process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
  name = str(data.name)
  encode_method = config['encode_method']
  fill_na = config['fill_na'] # <UNK>
-
  filled_data = data.fillna(fill_na).astype(str)
-
  if encode_method == 'label':
  le = LabelEncoder()
  le.fit(filled_data)
@@ -218,49 +195,32 @@ class DataProcessor(FeatureSpecMixin):
  elif encode_method == 'hash':
  config['vocab_size'] = config['hash_size']
 
- def _process_sparse_feature_transform(
- self,
- data: pd.Series,
- config: Dict[str, Any]
- ) -> np.ndarray:
- """Fast path sparse feature transform using cached dict mapping or hashing."""
+ def _process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
  name = str(data.name)
  encode_method = config['encode_method']
  fill_na = config['fill_na']
-
  sparse_series = pd.Series(data, name=name).fillna(fill_na).astype(str)
-
  if encode_method == 'label':
  le = self.label_encoders.get(name)
  if le is None:
  raise ValueError(f"LabelEncoder for {name} not fitted")
-
  class_to_idx = config.get('_class_to_idx')
  if class_to_idx is None:
  class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
  config['_class_to_idx'] = class_to_idx
-
  encoded = sparse_series.map(class_to_idx)
  encoded = encoded.fillna(0).astype(np.int64)
  return encoded.to_numpy()
-
  if encode_method == 'hash':
  hash_size = config['hash_size']
  hash_fn = self._hash_string
- return np.fromiter(
- (hash_fn(v, hash_size) for v in sparse_series.to_numpy()),
- dtype=np.int64,
- count=sparse_series.size,
- )
-
+ return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
  return np.array([], dtype=np.int64)
 
  def _process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
  name = str(data.name)
  encode_method = config['encode_method']
  separator = config['separator']
-
  if encode_method == 'label':
  all_tokens = set()
  for seq in data:
@@ -280,12 +240,9 @@ class DataProcessor(FeatureSpecMixin):
  tokens = [str(t) for t in seq.tolist()]
  else:
  continue
-
  all_tokens.update(tokens)
-
  if len(all_tokens) == 0:
  all_tokens.add('<PAD>')
-
  le = LabelEncoder()
  le.fit(list(all_tokens))
  self.label_encoders[name] = le
@@ -293,11 +250,7 @@ class DataProcessor(FeatureSpecMixin):
  elif encode_method == 'hash':
  config['vocab_size'] = config['hash_size']
 
- def _process_sequence_feature_transform(
- self,
- data: pd.Series,
- config: Dict[str, Any]
- ) -> np.ndarray:
+ def _process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
  """Optimized sequence transform with preallocation and cached vocab map."""
  name = str(data.name)
  encode_method = config['encode_method']
@@ -305,15 +258,12 @@ class DataProcessor(FeatureSpecMixin):
  pad_value = config['pad_value']
  truncate = config['truncate']
  separator = config['separator']
-
  arr = np.asarray(data, dtype=object)
  n = arr.shape[0]
  output = np.full((n, max_len), pad_value, dtype=np.int64)
-
  # Shared helpers cached locally for speed and cross-platform consistency
  split_fn = str.split
  is_nan = np.isnan
-
  if encode_method == 'label':
  le = self.label_encoders.get(name)
  if le is None:
@@ -324,10 +274,8 @@ class DataProcessor(FeatureSpecMixin):
  config['_class_to_idx'] = class_to_idx
  else:
  class_to_idx = None # type: ignore
-
  hash_fn = self._hash_string
  hash_size = config.get('hash_size')
-
  for i, seq in enumerate(arr):
  # normalize sequence to a list of strings
  tokens = []
@@ -342,14 +290,12 @@ class DataProcessor(FeatureSpecMixin):
  tokens = [str(t) for t in seq]
  else:
  tokens = []
-
  if encode_method == 'label':
  encoded = [
  class_to_idx.get(token.strip(), 0) # type: ignore[union-attr]
  for token in tokens
  if token is not None and token != ''
  ]
-
  elif encode_method == 'hash':
  if hash_size is None:
  raise ValueError("hash_size must be set for hash encoding")
@@ -360,27 +306,21 @@ class DataProcessor(FeatureSpecMixin):
  ]
  else:
  encoded = []
-
  if not encoded:
  continue
-
  if len(encoded) > max_len:
  encoded = encoded[-max_len:] if truncate == 'pre' else encoded[:max_len]
-
  output[i, : len(encoded)] = encoded
-
  return output
 
  def _process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
  name = str(data.name)
  target_type = config['target_type']
- label_map = config['label_map']
-
+ label_map = config.get('label_map')
  if target_type in ['binary', 'multiclass']:
  if label_map is None:
  unique_values = data.dropna().unique()
  sorted_values = sorted(unique_values)
-
  try:
  int_values = [int(v) for v in sorted_values]
  if int_values == list(range(len(int_values))):
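A quick illustration of the truncation and padding rule earlier in this hunk, with hypothetical values and assuming max_len=3 and pad_value=0: 'pre' keeps the most recent max_len tokens, 'post' keeps the earliest, and shorter sequences stay left-aligned and right-padded.

encoded = [11, 12, 13, 14, 15]
max_len, pad_value = 3, 0

pre = encoded[-max_len:]   # [13, 14, 15] -> truncate='pre' keeps the last items
post = encoded[:max_len]   # [11, 12, 13] -> truncate='post' keeps the first items

row = [pad_value] * max_len
short = [21, 22]
row[: len(short)] = short  # [21, 22, 0] -> shorter sequences are padded with pad_value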
@@ -389,29 +329,20 @@ class DataProcessor(FeatureSpecMixin):
  label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
  except (ValueError, TypeError):
  label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
-
  config['label_map'] = label_map
-
  self.target_encoders[name] = label_map
 
- def _process_target_transform(
- self,
- data: pd.Series,
- config: Dict[str, Any]
- ) -> np.ndarray:
+ def _process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
  logger = logging.getLogger()
-
  name = str(data.name)
- target_type = config['target_type']
-
+ target_type = config.get('target_type')
  if target_type == 'regression':
  values = np.array(data.values, dtype=np.float32)
  return values
  else:
  label_map = self.target_encoders.get(name)
  if label_map is None:
- raise ValueError(f"Target encoder for {name} not fitted")
-
+ raise ValueError(f"Target encoder for {name} not fitted")
  result = []
  for val in data:
  str_val = str(val)
@@ -420,7 +351,6 @@ class DataProcessor(FeatureSpecMixin):
  else:
  logger.warning(f"Unknown target value: {val}, mapping to 0")
  result.append(0)
-
  return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)
 
  def _load_dataframe_from_path(self, path: str) -> pd.DataFrame:
@@ -458,13 +388,10 @@ class DataProcessor(FeatureSpecMixin):
  "max": -np.inf,
  "max_abs": 0.0,
  }
-
  sparse_vocab: Dict[str, set[str]] = {name: set() for name in self.sparse_features.keys()}
  seq_vocab: Dict[str, set[str]] = {name: set() for name in self.sequence_features.keys()}
  target_values: Dict[str, set[Any]] = {name: set() for name in self.target_features.keys()}
-
  missing_features = set()
-
  for file_path in file_paths:
  for chunk in iter_file_chunks(file_path, file_type, chunk_size):
  # numeric features
@@ -514,25 +441,19 @@ class DataProcessor(FeatureSpecMixin):
  continue
  vals = chunk[name].dropna().tolist()
  target_values[name].update(vals)
-
  if missing_features:
- logger.warning(
- f"The following configured features were not found in provided files: {sorted(missing_features)}"
- )
-
+ logger.warning(f"The following configured features were not found in provided files: {sorted(missing_features)}")
  # finalize numeric scalers
  for name, config in self.numeric_features.items():
  acc = numeric_acc[name]
  if acc["count"] == 0:
  logger.warning(f"Numeric feature {name} has no valid values in provided files")
  continue
-
  mean_val = acc["sum"] / acc["count"]
  if config["fill_na"] is not None:
  config["fill_na_value"] = config["fill_na"]
  else:
  config["fill_na_value"] = mean_val
-
  scaler_type = config["scaler"]
  if scaler_type == "standard":
  var = max(acc["sumsq"] / acc["count"] - mean_val * mean_val, 0.0)
@@ -550,6 +471,11 @@ class DataProcessor(FeatureSpecMixin):
  scaler.data_max_ = np.array([data_max], dtype=np.float64)
  scaler.data_range_ = scaler.data_max_ - scaler.data_min_
  scaler.data_range_[scaler.data_range_ == 0] = 1.0
+ # Manually set scale_/min_ for streaming fit to mirror sklearn's internal fit logic
+ feature_min, feature_max = scaler.feature_range
+ scale = (feature_max - feature_min) / scaler.data_range_
+ scaler.scale_ = scale
+ scaler.min_ = feature_min - scaler.data_min_ * scale
  scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
  self.scalers[name] = scaler
  elif scaler_type == "maxabs":
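The block added above populates scale_ and min_ by hand because the scaler is built from streaming min/max statistics rather than a call to fit. A small sanity-check sketch, assuming scikit-learn's default feature_range=(0, 1), showing that the manually set attributes reproduce a normally fitted MinMaxScaler:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

x = np.array([[1.0], [5.0], [9.0]])
fitted = MinMaxScaler().fit(x)  # reference: fit on the full column

# Streaming-style construction from a running min/max, mirroring the hunk above.
manual = MinMaxScaler()
manual.data_min_ = np.array([1.0])
manual.data_max_ = np.array([9.0])
manual.data_range_ = manual.data_max_ - manual.data_min_
manual.data_range_[manual.data_range_ == 0] = 1.0
feature_min, feature_max = manual.feature_range
manual.scale_ = (feature_max - feature_min) / manual.data_range_
manual.min_ = feature_min - manual.data_min_ * manual.scale_
manual.n_samples_seen_ = np.array([3])

assert np.allclose(fitted.transform(x), manual.transform(x))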
@@ -626,9 +552,9 @@ class DataProcessor(FeatureSpecMixin):
  return_dict: bool,
  persist: bool,
  save_format: Optional[Literal["csv", "parquet"]],
+ output_path: Optional[str],
  ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
- logger = logging.getLogger()
-
+ logger = logging.getLogger()
  # Convert input to dict format for unified processing
  if isinstance(data, pd.DataFrame):
  data_dict = {col: data[col] for col in data.columns}
@@ -688,173 +614,133 @@ class DataProcessor(FeatureSpecMixin):
  columns_dict = {}
  for key, value in result.items():
  if key in self.sequence_features:
- columns_dict[key] = [list(seq) for seq in value]
+ # Use tolist to coerce numpy scalars to native Python ints for stable CSV rendering
+ columns_dict[key] = [np.asarray(seq).tolist() for seq in value]
  else:
  columns_dict[key] = value
  return pd.DataFrame(columns_dict)
-
- assert save_format in [None, "csv", "parquet"], "save_format must be either 'csv', 'parquet', or None"
- if persist and save_format is None:
- save_format = "parquet"
-
+
+ if save_format not in [None, "csv", "parquet"]:
+ raise ValueError("save_format must be either 'csv', 'parquet', or None")
+ effective_format = save_format
+ if persist:
+ effective_format = save_format or "parquet"
  result_df = None
- if (not return_dict) or (save_format is not None):
+ if (not return_dict) or persist:
  result_df = _dict_to_dataframe(result_dict)
- assert result_df is not None, "DataFrame is None after transform"
-
- if save_format is not None:
- save_path = resolve_save_path(
- path=None,
- default_dir=self.session_dir / "processor" / "preprocessed_data",
- default_name="data_processed",
- suffix=f".{save_format}",
- add_timestamp=True,
- )
-
- if save_format == "parquet":
+ if persist:
+ if output_path is None:
+ raise ValueError("output_path must be provided when persisting transformed data.")
+ output_dir = Path(output_path)
+ if output_dir.suffix:
+ output_dir = output_dir.parent
+ output_dir.mkdir(parents=True, exist_ok=True)
+ save_path = output_dir / f"transformed_data.{effective_format}"
+ assert result_df is not None, "DataFrame conversion failed"
+ if effective_format == "parquet":
  result_df.to_parquet(save_path, index=False)
  else:
  result_df.to_csv(save_path, index=False)
-
- logger.info(colorize(
- f"Transformed data saved to: {save_path}",
- color="green"
- ))
-
+ logger.info(colorize(f"Transformed data saved to: {save_path.resolve()}", color="green"))
  if return_dict:
  return result_dict
+ assert result_df is not None, "DataFrame is None after transform"
  return result_df
 
- def _transform_path(self, path: str, output_path: Optional[str]) -> list[str]:
+ def _transform_path(
+ self,
+ input_path: str,
+ output_path: Optional[str],
+ save_format: Optional[Literal["csv", "parquet"]],
+ ) -> list[str]:
  """Transform data from files under a path and save them to a new location."""
  logger = logging.getLogger()
-
- file_paths, file_type = resolve_file_paths(path)
- default_root = self.session_dir / "processor" / default_output_dir(path).name
- output_root = default_root
- target_file_override: Optional[Path] = None
-
- if output_path:
- output_path_obj = Path(output_path)
- if not output_path_obj.is_absolute():
- output_path_obj = self.session_dir / output_path_obj
- if output_path_obj.suffix.lower() in {".csv", ".parquet"}:
- if len(file_paths) != 1:
- raise ValueError("output_path points to a file but multiple input files were provided.")
- target_file_override = output_path_obj
- output_root = output_path_obj.parent
- else:
- output_root = output_path_obj
-
+ file_paths, file_type = resolve_file_paths(input_path)
+ target_format = save_format or file_type
+ if target_format not in ["csv", "parquet"]:
+ raise ValueError("save_format must be either 'csv' or 'parquet'")
+ base_output_dir = Path(output_path) if output_path else default_output_dir(input_path)
+ if base_output_dir.suffix:
+ base_output_dir = base_output_dir.parent
+ output_root = base_output_dir / "transformed_data"
  output_root.mkdir(parents=True, exist_ok=True)
-
- saved_paths: list[str] = []
+ saved_paths = []
  for file_path in file_paths:
  df = read_table(file_path, file_type)
-
  transformed_df = self._transform_in_memory(
  df,
  return_dict=False,
  persist=False,
  save_format=None,
+ output_path=None,
  )
  assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
-
  source_path = Path(file_path)
- target_file = (
- target_file_override
- if target_file_override is not None
- else output_root / f"{source_path.stem}_preprocessed{source_path.suffix}"
- )
-
- if file_type == "csv":
+ target_file = output_root / f"{source_path.stem}.{target_format}"
+ if target_format == "csv":
  transformed_df.to_csv(target_file, index=False)
  else:
  transformed_df.to_parquet(target_file, index=False)
-
  saved_paths.append(str(target_file.resolve()))
-
- logger.info(colorize(
- f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}",
- color="green",
- ))
+ logger.info(colorize(f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}", color="green",))
  return saved_paths
 
  # fit is nothing but registering the statistics from data so that we can transform the data later
- def fit(
- self,
- data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
- chunk_size: int = 200000,
- ):
+ def fit(self, data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],chunk_size: int = 200000,):
  logger = logging.getLogger()
-
  if isinstance(data, (str, os.PathLike)):
  path_str = str(data)
  uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
  if uses_robust:
- logger.warning(
- "Robust scaler requires full data; loading all files into memory. "
- "Consider smaller chunk_size or different scaler if memory is limited."
- )
+ logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
  data = self._load_dataframe_from_path(path_str)
  else:
  return self._fit_from_path(path_str, chunk_size)
  if isinstance(data, dict):
  data = pd.DataFrame(data)
-
  logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
-
  for name, config in self.numeric_features.items():
  if name not in data.columns:
  logger.warning(f"Numeric feature {name} not found in data")
  continue
  self._process_numeric_feature_fit(data[name], config)
-
  for name, config in self.sparse_features.items():
  if name not in data.columns:
  logger.warning(f"Sparse feature {name} not found in data")
  continue
  self._process_sparse_feature_fit(data[name], config)
-
  for name, config in self.sequence_features.items():
  if name not in data.columns:
  logger.warning(f"Sequence feature {name} not found in data")
  continue
  self._process_sequence_feature_fit(data[name], config)
-
  for name, config in self.target_features.items():
  if name not in data.columns:
  logger.warning(f"Target {name} not found in data")
  continue
  self._process_target_fit(data[name], config)
-
  self.is_fitted = True
- logger.info(colorize("DataProcessor fitted successfully", color="green", bold=True))
  return self
 
  def transform(
  self,
  data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
  return_dict: bool = True,
- persist: bool = False,
  save_format: Optional[Literal["csv", "parquet"]] = None,
  output_path: Optional[str] = None,
  ) -> Union[pd.DataFrame, Dict[str, np.ndarray], list[str]]:
- logger = logging.getLogger()
-
  if not self.is_fitted:
  raise ValueError("DataProcessor must be fitted before transform")
-
  if isinstance(data, (str, os.PathLike)):
- if return_dict or persist or save_format is not None:
- raise ValueError("Path transform writes files only; use output_path and leave return_dict/persist/save_format defaults.")
- return self._transform_path(str(data), output_path)
-
+ if return_dict:
+ raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
+ return self._transform_path(str(data), output_path, save_format)
  return self._transform_in_memory(
  data=data,
  return_dict=return_dict,
- persist=persist,
+ persist=output_path is not None,
  save_format=save_format,
+ output_path=output_path,
  )
 
  def fit_transform(
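On the tolist() change near the top of this hunk: list(seq) keeps NumPy scalar objects, whose reprs leak into the CSV text on NumPy 2.x, while np.asarray(seq).tolist() yields plain Python ints. A small illustration with hypothetical values:

import numpy as np

seq = np.array([3, 7, 11], dtype=np.int64)
print(str(list(seq)))                  # NumPy 2.x: "[np.int64(3), np.int64(7), np.int64(11)]"
print(str(np.asarray(seq).tolist()))   # "[3, 7, 11]" -- plain Python ints, stable in CSV cells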
@@ -872,21 +758,20 @@ class DataProcessor(FeatureSpecMixin):
  save_format=save_format,
  output_path=output_path,
  )
-
- def save(self, save_path: str):
- logger = logging.getLogger()
 
+ def save(self, save_path: str | Path):
+ logger = logging.getLogger()
+ assert isinstance(save_path, (str, Path)), "save_path must be a string or Path"
+ save_path = Path(save_path)
  if not self.is_fitted:
  logger.warning("Saving unfitted DataProcessor")
-
  target_path = resolve_save_path(
  path=save_path,
- default_dir=self.session.processor_dir,
- default_name="processor",
+ default_dir=Path(os.getcwd()),
+ default_name="fitted_processor",
  suffix=".pkl",
+ add_timestamp=False
  )
-
- # Prepare state dict
  state = {
  "numeric_features": self.numeric_features,
  "sparse_features": self.sparse_features,
@@ -896,43 +781,37 @@ class DataProcessor(FeatureSpecMixin):
  "scalers": self.scalers,
  "label_encoders": self.label_encoders,
  "target_encoders": self.target_encoders,
+ "processor_version": __version__,
  }
-
- # Save with pickle
  with open(target_path, "wb") as f:
  pickle.dump(state, f)
-
- logger.info(colorize(f"DataProcessor saved to: {target_path}", color="green"))
+ logger.info(f"DataProcessor saved to: {target_path}, NextRec version: {self.version}")
 
  @classmethod
- def load(cls, load_path: str) -> 'DataProcessor':
+ def load(cls, load_path: str | Path) -> 'DataProcessor':
  logger = logging.getLogger()
-
+ load_path = Path(load_path)
  with open(load_path, 'rb') as f:
  state = pickle.load(f)
-
  processor = cls()
- processor.numeric_features = state['numeric_features']
- processor.sparse_features = state['sparse_features']
- processor.sequence_features = state['sequence_features']
- processor.target_features = state['target_features']
- processor.is_fitted = state['is_fitted']
- processor.scalers = state['scalers']
- processor.label_encoders = state['label_encoders']
- processor.target_encoders = state['target_encoders']
-
- logger.info(f"DataProcessor loaded from {load_path}")
+ processor.numeric_features = state.get('numeric_features', {})
+ processor.sparse_features = state.get('sparse_features', {})
+ processor.sequence_features = state.get('sequence_features', {})
+ processor.target_features = state.get('target_features', {})
+ processor.is_fitted = state.get('is_fitted', False)
+ processor.scalers = state.get('scalers', {})
+ processor.label_encoders = state.get('label_encoders', {})
+ processor.target_encoders = state.get('target_encoders', {})
+ processor.version = state.get("processor_version", "unknown")
+ logger.info(f"DataProcessor loaded from {load_path}, NextRec version: {processor.version}")
  return processor
 
  def get_vocab_sizes(self) -> Dict[str, int]:
  vocab_sizes = {}
-
  for name, config in self.sparse_features.items():
  vocab_sizes[name] = config.get('vocab_size', 0)
-
  for name, config in self.sequence_features.items():
  vocab_sizes[name] = config.get('vocab_size', 0)
-
  return vocab_sizes
 
  def summary(self):