nextrec 0.4.7-py3-none-any.whl → 0.4.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/callback.py +30 -15
  3. nextrec/basic/features.py +1 -0
  4. nextrec/basic/layers.py +6 -8
  5. nextrec/basic/loggers.py +14 -7
  6. nextrec/basic/metrics.py +6 -76
  7. nextrec/basic/model.py +337 -328
  8. nextrec/cli.py +25 -4
  9. nextrec/data/__init__.py +13 -16
  10. nextrec/data/batch_utils.py +3 -2
  11. nextrec/data/data_processing.py +10 -2
  12. nextrec/data/data_utils.py +9 -14
  13. nextrec/data/dataloader.py +12 -13
  14. nextrec/data/preprocessor.py +328 -255
  15. nextrec/loss/__init__.py +1 -5
  16. nextrec/loss/loss_utils.py +2 -8
  17. nextrec/models/generative/__init__.py +1 -8
  18. nextrec/models/generative/hstu.py +6 -4
  19. nextrec/models/multi_task/esmm.py +2 -2
  20. nextrec/models/multi_task/mmoe.py +2 -2
  21. nextrec/models/multi_task/ple.py +2 -2
  22. nextrec/models/multi_task/poso.py +2 -3
  23. nextrec/models/multi_task/share_bottom.py +2 -2
  24. nextrec/models/ranking/afm.py +2 -2
  25. nextrec/models/ranking/autoint.py +2 -2
  26. nextrec/models/ranking/dcn.py +2 -2
  27. nextrec/models/ranking/dcn_v2.py +2 -2
  28. nextrec/models/ranking/deepfm.py +2 -2
  29. nextrec/models/ranking/dien.py +3 -3
  30. nextrec/models/ranking/din.py +3 -3
  31. nextrec/models/ranking/ffm.py +0 -0
  32. nextrec/models/ranking/fibinet.py +5 -5
  33. nextrec/models/ranking/fm.py +3 -7
  34. nextrec/models/ranking/lr.py +0 -0
  35. nextrec/models/ranking/masknet.py +2 -2
  36. nextrec/models/ranking/pnn.py +2 -2
  37. nextrec/models/ranking/widedeep.py +2 -2
  38. nextrec/models/ranking/xdeepfm.py +2 -2
  39. nextrec/models/representation/__init__.py +9 -0
  40. nextrec/models/{generative → representation}/rqvae.py +9 -9
  41. nextrec/models/retrieval/__init__.py +0 -0
  42. nextrec/models/{match → retrieval}/dssm.py +8 -3
  43. nextrec/models/{match → retrieval}/dssm_v2.py +8 -3
  44. nextrec/models/{match → retrieval}/mind.py +4 -3
  45. nextrec/models/{match → retrieval}/sdm.py +4 -3
  46. nextrec/models/{match → retrieval}/youtube_dnn.py +8 -3
  47. nextrec/utils/__init__.py +60 -46
  48. nextrec/utils/config.py +12 -10
  49. nextrec/utils/console.py +371 -0
  50. nextrec/utils/{synthetic_data.py → data.py} +102 -15
  51. nextrec/utils/feature.py +15 -0
  52. nextrec/utils/torch_utils.py +411 -0
  53. {nextrec-0.4.7.dist-info → nextrec-0.4.9.dist-info}/METADATA +8 -7
  54. nextrec-0.4.9.dist-info/RECORD +70 -0
  55. nextrec/utils/device.py +0 -78
  56. nextrec/utils/distributed.py +0 -141
  57. nextrec/utils/file.py +0 -92
  58. nextrec/utils/initializer.py +0 -79
  59. nextrec/utils/optimizer.py +0 -75
  60. nextrec/utils/tensor.py +0 -72
  61. nextrec-0.4.7.dist-info/RECORD +0 -70
  62. /nextrec/models/{match/__init__.py → ranking/eulernet.py} +0 -0
  63. {nextrec-0.4.7.dist-info → nextrec-0.4.9.dist-info}/WHEEL +0 -0
  64. {nextrec-0.4.7.dist-info → nextrec-0.4.9.dist-info}/entry_points.txt +0 -0
  65. {nextrec-0.4.7.dist-info → nextrec-0.4.9.dist-info}/licenses/LICENSE +0 -0
@@ -2,46 +2,48 @@
  DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.

  Date: create on 13/11/2025
- Checkpoint: edit on 02/12/2025
+ Checkpoint: edit on 19/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """

  from __future__ import annotations
+
+ import functools
+ import logging
  import os
  import pickle
- import hashlib
- import logging
+ from pathlib import Path
+ from typing import Any, Dict, Literal, Optional, Union
+
  import numpy as np
  import pandas as pd
-
- import tqdm
- from pathlib import Path
- from typing import Dict, Union, Optional, Literal, Any
+ import pyarrow as pa
+ import pyarrow.parquet as pq
  from sklearn.preprocessing import (
- StandardScaler,
+ LabelEncoder,
+ MaxAbsScaler,
  MinMaxScaler,
  RobustScaler,
- MaxAbsScaler,
- LabelEncoder,
+ StandardScaler,
  )

-
+ from nextrec.__version__ import __version__
  from nextrec.basic.features import FeatureSet
  from nextrec.basic.loggers import colorize
  from nextrec.basic.session import resolve_save_path
- from nextrec.utils.file import (
- resolve_file_paths,
+ from nextrec.data.data_processing import hash_md5_mod
+ from nextrec.utils.console import progress
+ from nextrec.utils.data import (
+ default_output_dir,
  iter_file_chunks,
- read_table,
  load_dataframes,
- default_output_dir,
+ read_table,
+ resolve_file_paths,
  )

- from nextrec.__version__ import __version__
-

  class DataProcessor(FeatureSet):
- def __init__(self):
+ def __init__(self, hash_cache_size: int = 200_000):
  self.numeric_features: Dict[str, Dict[str, Any]] = {}
  self.sparse_features: Dict[str, Dict[str, Any]] = {}
  self.sequence_features: Dict[str, Dict[str, Any]] = {}
@@ -56,7 +58,16 @@ class DataProcessor(FeatureSet):
  self.scalers: Dict[str, Any] = {}
  self.label_encoders: Dict[str, LabelEncoder] = {}
  self.target_encoders: Dict[str, Dict[str, int]] = {}
- self.set_target_id([], [])
+ self.set_target_id(target=[], id_columns=[])
+
+ # cache hash function
+ self.hash_cache_size = int(hash_cache_size)
+ if self.hash_cache_size > 0:
+ self.hash_fn = functools.lru_cache(maxsize=self.hash_cache_size)(
+ hash_md5_mod
+ )
+ else:
+ self.hash_fn = hash_md5_mod

  def add_numeric_feature(
  self,
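Note: the constructor change above memoizes the shared hash_md5_mod helper with functools.lru_cache when hash_cache_size > 0. A minimal standalone sketch of the same pattern; the body of hash_md5_mod shown here is assumed from the old inline implementation, not copied from nextrec.data.data_processing:

    import functools
    import hashlib

    def hash_md5_mod(s: str, hash_size: int) -> int:
        # Bucket a string into [0, hash_size) via its md5 digest (assumed behavior).
        return int(hashlib.md5(s.encode()).hexdigest(), 16) % hash_size

    # Memoize repeated tokens; categorical columns tend to repeat values heavily.
    cached_hash = functools.lru_cache(maxsize=200_000)(hash_md5_mod)

    assert cached_hash("item_42", 1000) == hash_md5_mod("item_42", 1000)
    print(cached_hash.cache_info())  # hits/misses/currsize, as exposed by hash_cache_info()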
@@ -76,7 +87,9 @@ class DataProcessor(FeatureSet):
  fill_na: str = "<UNK>",
  ):
  if encode_method == "hash" and hash_size is None:
- raise ValueError("hash_size must be specified when encode_method='hash'")
+ raise ValueError(
+ "[Data Processor Error] hash_size must be specified when encode_method='hash'"
+ )
  self.sparse_features[name] = {
  "encode_method": encode_method,
  "hash_size": hash_size,
@@ -96,7 +109,9 @@ class DataProcessor(FeatureSet):
  separator: str = ",",
  ):
  if encode_method == "hash" and hash_size is None:
- raise ValueError("hash_size must be specified when encode_method='hash'")
+ raise ValueError(
+ "[Data Processor Error] hash_size must be specified when encode_method='hash'"
+ )
  self.sequence_features[name] = {
  "encode_method": encode_method,
  "hash_size": hash_size,
@@ -109,7 +124,7 @@ class DataProcessor(FeatureSet):
  def add_target(
  self,
  name: str, # example: 'click'
- target_type: Literal["binary", "multiclass", "regression"] = "binary",
+ target_type: Literal["binary", "regression"] = "binary",
  label_map: Optional[
  Dict[str, int]
  ] = None, # example: {'click': 1, 'no_click': 0}
@@ -121,7 +136,18 @@ class DataProcessor(FeatureSet):
  self.set_target_id(list(self.target_features.keys()), [])

  def hash_string(self, s: str, hash_size: int) -> int:
- return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size
+ return self.hash_fn(str(s), int(hash_size))
+
+ def clear_hash_cache(self) -> None:
+ cache_clear = getattr(self.hash_fn, "cache_clear", None)
+ if callable(cache_clear):
+ cache_clear()
+
+ def hash_cache_info(self):
+ cache_info = getattr(self.hash_fn, "cache_info", None)
+ if callable(cache_info):
+ return cache_info()
+ return None

  def process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
  name = str(data.name)
@@ -132,21 +158,22 @@ class DataProcessor(FeatureSet):
  # Default use mean value to fill missing values for numeric features
  fill_na = data.mean()
  config["fill_na_value"] = fill_na
- if scaler_type == "standard":
- scaler = StandardScaler()
- elif scaler_type == "minmax":
- scaler = MinMaxScaler()
- elif scaler_type == "robust":
- scaler = RobustScaler()
- elif scaler_type == "maxabs":
- scaler = MaxAbsScaler()
- elif scaler_type == "log":
- scaler = None
- elif scaler_type == "none":
+ scaler_map = {
+ "standard": StandardScaler,
+ "minmax": MinMaxScaler,
+ "robust": RobustScaler,
+ "maxabs": MaxAbsScaler,
+ }
+ if scaler_type in ("log", "none"):
  scaler = None
  else:
- raise ValueError(f"Unknown scaler type: {scaler_type}")
- if scaler is not None and scaler_type != "log":
+ scaler_cls = scaler_map.get(scaler_type)
+ if scaler_cls is None:
+ raise ValueError(
+ f"[Data Processor Error] Unknown scaler type: {scaler_type}"
+ )
+ scaler = scaler_cls()
+ if scaler is not None:
  filled_data = data.fillna(config.get("fill_na_value", 0))
  values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
  scaler.fit(values)
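The if/elif chain for picking a scaler is replaced by a lookup table. A small sketch of the resulting dispatch logic, using the same scikit-learn classes (the function name is illustrative):

    from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

    SCALER_MAP = {
        "standard": StandardScaler,
        "minmax": MinMaxScaler,
        "robust": RobustScaler,
        "maxabs": MaxAbsScaler,
    }

    def build_scaler(scaler_type: str):
        # "log" and "none" need no fitted scaler object.
        if scaler_type in ("log", "none"):
            return None
        scaler_cls = SCALER_MAP.get(scaler_type)
        if scaler_cls is None:
            raise ValueError(f"Unknown scaler type: {scaler_type}")
        return scaler_cls()

    print(type(build_scaler("minmax")).__name__)  # MinMaxScaler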
@@ -177,15 +204,18 @@ class DataProcessor(FeatureSet):
  return result

  def process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
- name = str(data.name)
+ _ = str(data.name)
  encode_method = config["encode_method"]
  fill_na = config["fill_na"] # <UNK>
  filled_data = data.fillna(fill_na).astype(str)
  if encode_method == "label":
- le = LabelEncoder()
- le.fit(filled_data)
- self.label_encoders[name] = le
- config["vocab_size"] = len(le.classes_)
+ vocab = sorted(set(filled_data.tolist()))
+ if "<UNK>" not in vocab:
+ vocab.append("<UNK>")
+ token_to_idx = {token: idx for idx, token in enumerate(vocab)}
+ config["_token_to_idx"] = token_to_idx
+ config["_unk_index"] = token_to_idx["<UNK>"]
+ config["vocab_size"] = len(vocab)
  elif encode_method == "hash":
  config["vocab_size"] = config["hash_size"]

@@ -195,18 +225,32 @@ class DataProcessor(FeatureSet):
  name = str(data.name)
  encode_method = config["encode_method"]
  fill_na = config["fill_na"]
- sparse_series = pd.Series(data, name=name).fillna(fill_na).astype(str)
+
+ sparse_series = (
+ data if isinstance(data, pd.Series) else pd.Series(data, name=name)
+ )
+ sparse_series = sparse_series.fillna(fill_na).astype(str)
  if encode_method == "label":
+ token_to_idx = config.get("_token_to_idx")
+ if isinstance(token_to_idx, dict):
+ unk_index = int(config.get("_unk_index", 0))
+ return np.fromiter(
+ (token_to_idx.get(v, unk_index) for v in sparse_series.to_numpy()),
+ dtype=np.int64,
+ count=sparse_series.size,
+ )
  le = self.label_encoders.get(name)
  if le is None:
- raise ValueError(f"LabelEncoder for {name} not fitted")
- class_to_idx = config.get("_class_to_idx")
- if class_to_idx is None:
- class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
- config["_class_to_idx"] = class_to_idx
- encoded = sparse_series.map(class_to_idx)
- encoded = encoded.fillna(0).astype(np.int64)
- return encoded.to_numpy()
+ raise ValueError(
+ f"[Data Processor Error] LabelEncoder for {name} not fitted"
+ )
+ cat = pd.Categorical(sparse_series, categories=le.classes_)
+ codes = cat.codes # -1 indicates unknown category
+ unk_index = 0
+ if "<UNK>" in le.classes_:
+ unk_index = int(list(le.classes_).index("<UNK>"))
+ return np.where(codes < 0, unk_index, codes).astype(np.int64, copy=False)
+
  if encode_method == "hash":
  hash_size = config["hash_size"]
  hash_fn = self.hash_string
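Label transform for sparse features now maps values through pd.Categorical codes and redirects unseen values to the <UNK> index instead of silently mapping them to 0. A self-contained illustration with hypothetical values:

    import numpy as np
    import pandas as pd

    classes = np.array(["<UNK>", "phone", "tablet"])   # hypothetical fitted vocabulary
    values = pd.Series(["phone", "laptop", "tablet"])  # "laptop" was never seen during fit

    codes = pd.Categorical(values, categories=classes).codes  # unknown -> -1
    unk_index = int(np.where(classes == "<UNK>")[0][0])
    encoded = np.where(codes < 0, unk_index, codes).astype(np.int64)
    print(encoded)  # [1 0 2]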
@@ -218,35 +262,22 @@ class DataProcessor(FeatureSet):
  return np.array([], dtype=np.int64)

  def process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
- name = str(data.name)
+ _ = str(data.name)
  encode_method = config["encode_method"]
  separator = config["separator"]
  if encode_method == "label":
  all_tokens = set()
  for seq in data:
- # Skip None, np.nan, and empty strings
- if seq is None:
- continue
- if isinstance(seq, (float, np.floating)) and np.isnan(seq):
- continue
- if isinstance(seq, str) and seq.strip() == "":
- continue
-
- if isinstance(seq, str):
- tokens = seq.split(separator)
- elif isinstance(seq, (list, tuple)):
- tokens = [str(t) for t in seq]
- elif isinstance(seq, np.ndarray):
- tokens = [str(t) for t in seq.tolist()]
- else:
- continue
- all_tokens.update(tokens)
- if len(all_tokens) == 0:
- all_tokens.add("<PAD>")
- le = LabelEncoder()
- le.fit(list(all_tokens))
- self.label_encoders[name] = le
- config["vocab_size"] = len(le.classes_)
+ all_tokens.update(self.extract_sequence_tokens(seq, separator))
+ vocab = sorted(all_tokens)
+ if not vocab:
+ vocab = ["<PAD>"]
+ if "<UNK>" not in vocab:
+ vocab.append("<UNK>")
+ token_to_idx = {token: idx for idx, token in enumerate(vocab)}
+ config["_token_to_idx"] = token_to_idx
+ config["_unk_index"] = token_to_idx["<UNK>"]
+ config["vocab_size"] = len(vocab)
  elif encode_method == "hash":
  config["vocab_size"] = config["hash_size"]
 
@@ -267,15 +298,17 @@ class DataProcessor(FeatureSet):
  split_fn = str.split
  is_nan = np.isnan
  if encode_method == "label":
- le = self.label_encoders.get(name)
- if le is None:
- raise ValueError(f"LabelEncoder for {name} not fitted")
- class_to_idx = config.get("_class_to_idx")
+ class_to_idx = config.get("_token_to_idx") or config.get("_class_to_idx")
  if class_to_idx is None:
+ le = self.label_encoders.get(name)
+ if le is None:
+ raise ValueError(f"LabelEncoder for {name} not fitted")
  class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
  config["_class_to_idx"] = class_to_idx
+ unk_index = int(config.get("_unk_index", class_to_idx.get("<UNK>", 0)))
  else:
  class_to_idx = None # type: ignore
+ unk_index = 0
  hash_fn = self.hash_string
  hash_size = config.get("hash_size")
  for i, seq in enumerate(arr):
@@ -294,13 +327,15 @@ class DataProcessor(FeatureSet):
  tokens = []
  if encode_method == "label":
  encoded = [
- class_to_idx.get(token.strip(), 0) # type: ignore[union-attr]
+ class_to_idx.get(token.strip(), unk_index) # type: ignore[union-attr]
  for token in tokens
  if token is not None and token != ""
  ]
  elif encode_method == "hash":
  if hash_size is None:
- raise ValueError("hash_size must be set for hash encoding")
+ raise ValueError(
+ "[Data Processor Error] hash_size must be set for hash encoding"
+ )
  encoded = [
  hash_fn(str(token), hash_size)
  for token in tokens
@@ -319,7 +354,7 @@ class DataProcessor(FeatureSet):
  name = str(data.name)
  target_type = config["target_type"]
  label_map = config.get("label_map")
- if target_type in ["binary", "multiclass"]:
+ if target_type == "binary":
  if label_map is None:
  unique_values = data.dropna().unique()
  sorted_values = sorted(unique_values)
@@ -345,10 +380,12 @@ class DataProcessor(FeatureSet):
  if target_type == "regression":
  values = np.array(data.values, dtype=np.float32)
  return values
- else:
+ if target_type == "binary":
  label_map = self.target_encoders.get(name)
  if label_map is None:
- raise ValueError(f"Target encoder for {name} not fitted")
+ raise ValueError(
+ f"[Data Processor Error] Target encoder for {name} not fitted"
+ )
  result = []
  for val in data:
  str_val = str(val)
@@ -357,9 +394,10 @@ class DataProcessor(FeatureSet):
  else:
  logger.warning(f"Unknown target value: {val}, mapping to 0")
  result.append(0)
- return np.array(
- result, dtype=np.int64 if target_type == "multiclass" else np.float32
- )
+ return np.array(result, dtype=np.float32)
+ raise ValueError(
+ f"[Data Processor Error] Unsupported target type: {target_type}"
+ )

  def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
  """Load all data from a file or directory path into a single DataFrame."""
@@ -414,51 +452,47 @@ class DataProcessor(FeatureSet):
  missing_features = set()
  for file_path in file_paths:
  for chunk in iter_file_chunks(file_path, file_type, chunk_size):
- # numeric features
- for name, config in self.numeric_features.items():
- if name not in chunk.columns:
- missing_features.add(name)
- continue
- series = chunk[name]
- values = pd.to_numeric(series, errors="coerce")
- values = values.dropna()
- if values.empty:
- continue
- acc = numeric_acc[name]
- arr = values.to_numpy(dtype=np.float64, copy=False)
- acc["count"] += arr.size
- acc["sum"] += float(arr.sum())
- acc["sumsq"] += float(np.square(arr).sum())
- acc["min"] = min(acc["min"], float(arr.min()))
- acc["max"] = max(acc["max"], float(arr.max()))
- acc["max_abs"] = max(acc["max_abs"], float(np.abs(arr).max()))
-
- # sparse features
- for name, config in self.sparse_features.items():
- if name not in chunk.columns:
- missing_features.add(name)
- continue
- fill_na = config["fill_na"]
- series = chunk[name].fillna(fill_na).astype(str)
- sparse_vocab[name].update(series.tolist())
-
- # sequence features
- for name, config in self.sequence_features.items():
- if name not in chunk.columns:
- missing_features.add(name)
- continue
- separator = config["separator"]
- series = chunk[name]
- tokens = []
- for val in series:
- tokens.extend(self.extract_sequence_tokens(val, separator))
- seq_vocab[name].update(tokens)
+ columns = set(chunk.columns)
+ feature_groups = [
+ ("numeric", self.numeric_features),
+ ("sparse", self.sparse_features),
+ ("sequence", self.sequence_features),
+ ]
+ for group, features in feature_groups:
+ missing_features.update(features.keys() - columns)
+ for name in features.keys() & columns:
+ config = features[name]
+ series = chunk[name]
+ if group == "numeric":
+ values = pd.to_numeric(series, errors="coerce").dropna()
+ if values.empty:
+ continue
+ acc = numeric_acc[name]
+ arr = values.to_numpy(dtype=np.float64, copy=False)
+ acc["count"] += arr.size
+ acc["sum"] += float(arr.sum())
+ acc["sumsq"] += float(np.square(arr).sum())
+ acc["min"] = min(acc["min"], float(arr.min()))
+ acc["max"] = max(acc["max"], float(arr.max()))
+ acc["max_abs"] = max(
+ acc["max_abs"], float(np.abs(arr).max())
+ )
+ elif group == "sparse":
+ fill_na = config["fill_na"]
+ series = series.fillna(fill_na).astype(str)
+ sparse_vocab[name].update(series.tolist())
+ else:
+ separator = config["separator"]
+ tokens = []
+ for val in series:
+ tokens.extend(
+ self.extract_sequence_tokens(val, separator)
+ )
+ seq_vocab[name].update(tokens)

  # target features
- for name in self.target_features.keys():
- if name not in chunk.columns:
- missing_features.add(name)
- continue
+ missing_features.update(self.target_features.keys() - columns)
+ for name in self.target_features.keys() & columns:
  vals = chunk[name].dropna().tolist()
  target_values[name].update(vals)
  if missing_features:
@@ -489,6 +523,7 @@ class DataProcessor(FeatureSet):
  )
  scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
  self.scalers[name] = scaler
+
  elif scaler_type == "minmax":
  data_min = acc["min"] if np.isfinite(acc["min"]) else 0.0
  data_max = acc["max"] if np.isfinite(acc["max"]) else data_min
@@ -504,11 +539,13 @@ class DataProcessor(FeatureSet):
  scaler.min_ = feature_min - scaler.data_min_ * scale
  scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
  self.scalers[name] = scaler
+
  elif scaler_type == "maxabs":
  scaler = MaxAbsScaler()
  scaler.max_abs_ = np.array([acc["max_abs"]], dtype=np.float64)
  scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
  self.scalers[name] = scaler
+
  elif scaler_type in ("log", "none", "robust"):
  # log and none do not require fitting; robust requires full data and is handled earlier
  continue
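The streaming fit rebuilds fitted scaler objects directly from the per-chunk accumulators (count, sum, sum of squares, min/max) rather than refitting on the full data. A sketch of the StandardScaler case with made-up numbers, following the same pattern the hunks above show for MinMaxScaler and MaxAbsScaler (manually assigning scikit-learn's fitted attributes):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    count, total, sumsq = 1_000_000, 3.2e6, 1.6e7  # hypothetical accumulated statistics
    mean = total / count
    var = max(sumsq / count - mean**2, 0.0)

    scaler = StandardScaler()
    scaler.mean_ = np.array([mean])
    scaler.var_ = np.array([var])
    scaler.scale_ = np.sqrt(scaler.var_)
    scaler.n_samples_seen_ = np.array([count], dtype=np.int64)

    print(scaler.transform(np.array([[4.0]])))  # standardized using the streamed statistics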
@@ -522,21 +559,27 @@ class DataProcessor(FeatureSet):
  if not vocab:
  logger.warning(f"Sparse feature {name} has empty vocabulary")
  continue
- le = LabelEncoder()
- le.fit(list(vocab))
- self.label_encoders[name] = le
- config["vocab_size"] = len(le.classes_)
+ vocab_list = sorted(vocab)
+ if "<UNK>" not in vocab_list:
+ vocab_list.append("<UNK>")
+ token_to_idx = {token: idx for idx, token in enumerate(vocab_list)}
+ config["_token_to_idx"] = token_to_idx
+ config["_unk_index"] = token_to_idx["<UNK>"]
+ config["vocab_size"] = len(vocab_list)
  elif config["encode_method"] == "hash":
  config["vocab_size"] = config["hash_size"]

  # finalize sequence vocabularies
  for name, config in self.sequence_features.items():
  if config["encode_method"] == "label":
- vocab = seq_vocab[name] or {"<PAD>"}
- le = LabelEncoder()
- le.fit(list(vocab))
- self.label_encoders[name] = le
- config["vocab_size"] = len(le.classes_)
+ vocab_set = seq_vocab[name]
+ vocab_list = sorted(vocab_set) if vocab_set else ["<PAD>"]
+ if "<UNK>" not in vocab_list:
+ vocab_list.append("<UNK>")
+ token_to_idx = {token: idx for idx, token in enumerate(vocab_list)}
+ config["_token_to_idx"] = token_to_idx
+ config["_unk_index"] = token_to_idx["<UNK>"]
+ config["vocab_size"] = len(vocab_list)
  elif config["encode_method"] == "hash":
  config["vocab_size"] = config["hash_size"]

@@ -545,37 +588,14 @@ class DataProcessor(FeatureSet):
  if not target_values[name]:
  logger.warning(f"Target {name} has no valid values in provided files")
  continue
-
- target_type = config["target_type"]
- if target_type in ["binary", "multiclass"]:
- unique_values = list(target_values[name])
- try:
- sorted_values = sorted(unique_values)
- except TypeError:
- sorted_values = sorted(unique_values, key=lambda x: str(x))
-
- label_map = config["label_map"]
- if label_map is None:
- try:
- int_values = [int(v) for v in sorted_values]
- if int_values == list(range(len(int_values))):
- label_map = {str(val): int(val) for val in sorted_values}
- else:
- label_map = {
- str(val): idx for idx, val in enumerate(sorted_values)
- }
- except (ValueError, TypeError):
- label_map = {
- str(val): idx for idx, val in enumerate(sorted_values)
- }
- config["label_map"] = label_map
-
- self.target_encoders[name] = label_map
+ self.process_target_fit(
+ pd.Series(list(target_values[name]), name=name), config
+ )

  self.is_fitted = True
  logger.info(
  colorize(
- "DataProcessor fitted successfully (streaming path mode)",
+ "DataProcessor fitted successfully",
  color="green",
  bold=True,
  )
@@ -589,69 +609,59 @@ class DataProcessor(FeatureSet):
  persist: bool,
  save_format: Optional[Literal["csv", "parquet"]],
  output_path: Optional[str],
+ warn_missing: bool = True,
  ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
  logger = logging.getLogger()
- # Convert input to dict format for unified processing
- if isinstance(data, pd.DataFrame):
- data_dict = {col: data[col] for col in data.columns}
- elif isinstance(data, dict):
- data_dict = data
+ is_dataframe = isinstance(data, pd.DataFrame)
+ data_dict = data if isinstance(data, dict) else None
+
+ result_dict: Dict[str, np.ndarray] = {}
+ if is_dataframe:
+ df: pd.DataFrame = data # type: ignore[assignment]
+ for col in df.columns:
+ result_dict[col] = df[col].to_numpy(copy=False)
  else:
- raise ValueError(f"Unsupported data type: {type(data)}")
-
- result_dict = {}
- for key, value in data_dict.items():
- if isinstance(value, pd.Series):
- result_dict[key] = value.values
- elif isinstance(value, np.ndarray):
- result_dict[key] = value
- else:
- result_dict[key] = np.array(value)
-
- # process numeric features
- for name, config in self.numeric_features.items():
- if name not in data_dict:
- logger.warning(f"Numeric feature {name} not found in data")
- continue
- # Convert to Series for processing
- series_data = pd.Series(data_dict[name], name=name)
- processed = self.process_numeric_feature_transform(series_data, config)
- result_dict[name] = processed
-
- # process sparse features
- for name, config in self.sparse_features.items():
- if name not in data_dict:
- logger.warning(f"Sparse feature {name} not found in data")
- continue
- series_data = pd.Series(data_dict[name], name=name)
- processed = self.process_sparse_feature_transform(series_data, config)
- result_dict[name] = processed
-
- # process sequence features
- for name, config in self.sequence_features.items():
- if name not in data_dict:
- logger.warning(f"Sequence feature {name} not found in data")
- continue
- series_data = pd.Series(data_dict[name], name=name)
- processed = self.process_sequence_feature_transform(series_data, config)
- result_dict[name] = processed
-
- # process target features
- for name, config in self.target_features.items():
- if name not in data_dict:
- logger.warning(f"Target {name} not found in data")
- continue
- series_data = pd.Series(data_dict[name], name=name)
- processed = self.process_target_transform(series_data, config)
- result_dict[name] = processed
+ if data_dict is None:
+ raise ValueError(
+ f"[Data Processor Error] Unsupported data type: {type(data)}"
+ )
+ for key, value in data_dict.items():
+ if isinstance(value, pd.Series):
+ result_dict[key] = value.to_numpy(copy=False)
+ else:
+ result_dict[key] = np.asarray(value)
+
+ data_columns = data.columns if is_dataframe else data_dict
+ feature_groups = [
+ ("Numeric", self.numeric_features, self.process_numeric_feature_transform),
+ ("Sparse", self.sparse_features, self.process_sparse_feature_transform),
+ (
+ "Sequence",
+ self.sequence_features,
+ self.process_sequence_feature_transform,
+ ),
+ ("Target", self.target_features, self.process_target_transform),
+ ]
+ for label, features, transform_fn in feature_groups:
+ for name, config in features.items():
+ present = name in data_columns # type: ignore[operator]
+ if not present:
+ if warn_missing:
+ logger.warning(f"{label} feature {name} not found in data")
+ continue
+ series_data = (
+ data[name]
+ if is_dataframe
+ else pd.Series(result_dict[name], name=name)
+ )
+ result_dict[name] = transform_fn(series_data, config)

  def dict_to_dataframe(result: Dict[str, np.ndarray]) -> pd.DataFrame:
  # Convert all arrays to Series/lists at once to avoid fragmentation
  columns_dict = {}
  for key, value in result.items():
  if key in self.sequence_features:
- # Use tolist to coerce numpy scalars to native Python ints for stable CSV rendering
- columns_dict[key] = [np.asarray(seq).tolist() for seq in value]
+ columns_dict[key] = np.asarray(value).tolist()
  else:
  columns_dict[key] = value
  return pd.DataFrame(columns_dict)
@@ -667,7 +677,7 @@ class DataProcessor(FeatureSet):
  if persist:
  if output_path is None:
  raise ValueError(
- "output_path must be provided when persisting transformed data."
+ "[Data Processor Error] output_path must be provided when persisting transformed data."
  )
  output_dir = Path(output_path)
  if output_dir.suffix:
@@ -694,8 +704,12 @@ class DataProcessor(FeatureSet):
  input_path: str,
  output_path: Optional[str],
  save_format: Optional[Literal["csv", "parquet"]],
+ chunk_size: int = 200000,
  ) -> list[str]:
- """Transform data from files under a path and save them to a new location."""
+ """Transform data from files under a path and save them to a new location.
+
+ Uses chunked reading/writing to keep peak memory bounded for large files.
+ """
  logger = logging.getLogger()
  file_paths, file_type = resolve_file_paths(input_path)
  target_format = save_format or file_type
@@ -709,20 +723,82 @@ class DataProcessor(FeatureSet):
  output_root = base_output_dir / "transformed_data"
  output_root.mkdir(parents=True, exist_ok=True)
  saved_paths = []
- for file_path in tqdm.tqdm(file_paths, desc="Transforming files", unit="file"):
- df = read_table(file_path, file_type)
- transformed_df = self.transform_in_memory(
- df, return_dict=False, persist=False, save_format=None, output_path=None
- )
- assert isinstance(
- transformed_df, pd.DataFrame
- ), "Expected DataFrame when return_dict=False"
+ for file_path in progress(file_paths, description="Transforming files"):
  source_path = Path(file_path)
  target_file = output_root / f"{source_path.stem}.{target_format}"
- if target_format == "csv":
- transformed_df.to_csv(target_file, index=False)
+
+ # Stream transform for large files
+
+ if chunk_size <= 0:
+ # fallback to full load behavior
+ df = read_table(file_path, file_type)
+ transformed_df = self.transform_in_memory(
+ df,
+ return_dict=False,
+ persist=False,
+ save_format=None,
+ output_path=None,
+ warn_missing=True,
+ )
+ assert isinstance(
+ transformed_df, pd.DataFrame
+ ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+ if target_format == "csv":
+ transformed_df.to_csv(target_file, index=False)
+ else:
+ transformed_df.to_parquet(target_file, index=False)
+ saved_paths.append(str(target_file.resolve()))
+ continue
+
+ first_chunk = True
+ if target_format == "parquet":
+ writer: pq.ParquetWriter | None = None
+ try:
+ for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+ transformed_df = self.transform_in_memory(
+ chunk,
+ return_dict=False,
+ persist=False,
+ save_format=None,
+ output_path=None,
+ warn_missing=first_chunk,
+ )
+ assert isinstance(
+ transformed_df, pd.DataFrame
+ ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+ table = pa.Table.from_pandas(
+ transformed_df, preserve_index=False
+ )
+ if writer is None:
+ writer = pq.ParquetWriter(target_file, table.schema)
+ writer.write_table(table)
+ first_chunk = False
+ finally:
+ if writer is not None:
+ writer.close()
  else:
- transformed_df.to_parquet(target_file, index=False)
+ # CSV: append chunks; header only once
+ # (truncate first to avoid mixing with existing files)
+ target_file.parent.mkdir(parents=True, exist_ok=True)
+ with open(target_file, "w", encoding="utf-8", newline="") as f:
+ f.write("")
+ for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+ transformed_df = self.transform_in_memory(
+ chunk,
+ return_dict=False,
+ persist=False,
+ save_format=None,
+ output_path=None,
+ warn_missing=first_chunk,
+ )
+ assert isinstance(
+ transformed_df, pd.DataFrame
+ ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+ transformed_df.to_csv(
+ target_file, index=False, mode="a", header=first_chunk
+ )
+ first_chunk = False
+
  saved_paths.append(str(target_file.resolve()))
  logger.info(
  colorize(
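transform_path now streams each input file chunk by chunk into its output: one pyarrow ParquetWriter per parquet target, or appended CSV writes with the header only on the first chunk. A minimal standalone sketch of the parquet branch (file name and chunk source are placeholders):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    def write_chunks_parquet(chunks, target_file: str) -> None:
        # Write an iterable of DataFrame chunks into one parquet file without
        # materializing the whole table in memory.
        writer = None
        try:
            for chunk in chunks:
                table = pa.Table.from_pandas(chunk, preserve_index=False)
                if writer is None:
                    writer = pq.ParquetWriter(target_file, table.schema)
                writer.write_table(table)
        finally:
            if writer is not None:
                writer.close()

    chunks = (pd.DataFrame({"x": range(i, i + 3)}) for i in range(0, 9, 3))
    write_chunks_parquet(chunks, "transformed.parquet")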
@@ -754,26 +830,18 @@ class DataProcessor(FeatureSet):
  if isinstance(data, dict):
  data = pd.DataFrame(data)
  logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
- for name, config in self.numeric_features.items():
- if name not in data.columns:
- logger.warning(f"Numeric feature {name} not found in data")
- continue
- self.process_numeric_feature_fit(data[name], config)
- for name, config in self.sparse_features.items():
- if name not in data.columns:
- logger.warning(f"Sparse feature {name} not found in data")
- continue
- self.process_sparse_feature_fit(data[name], config)
- for name, config in self.sequence_features.items():
- if name not in data.columns:
- logger.warning(f"Sequence feature {name} not found in data")
- continue
- self.process_sequence_feature_fit(data[name], config)
- for name, config in self.target_features.items():
- if name not in data.columns:
- logger.warning(f"Target {name} not found in data")
- continue
- self.process_target_fit(data[name], config)
+ feature_groups = [
+ ("Numeric", self.numeric_features, self.process_numeric_feature_fit),
+ ("Sparse", self.sparse_features, self.process_sparse_feature_fit),
+ ("Sequence", self.sequence_features, self.process_sequence_feature_fit),
+ ("Target", self.target_features, self.process_target_fit),
+ ]
+ for label, features, fit_fn in feature_groups:
+ for name, config in features.items():
+ if name not in data.columns:
+ logger.warning(f"{label} feature {name} not found in data")
+ continue
+ fit_fn(data[name], config)
  self.is_fitted = True
  return self

@@ -783,15 +851,20 @@ class DataProcessor(FeatureSet):
  return_dict: bool = True,
  save_format: Optional[Literal["csv", "parquet"]] = None,
  output_path: Optional[str] = None,
+ chunk_size: int = 200000,
  ) -> Union[pd.DataFrame, Dict[str, np.ndarray], list[str]]:
  if not self.is_fitted:
- raise ValueError("DataProcessor must be fitted before transform")
+ raise ValueError(
+ "[Data Processor Error] DataProcessor must be fitted before transform"
+ )
  if isinstance(data, (str, os.PathLike)):
  if return_dict:
  raise ValueError(
- "Path transform writes files only; set return_dict=False when passing a path."
+ "[Data Processor Error] Path transform writes files only; set return_dict=False when passing a path."
  )
- return self.transform_path(str(data), output_path, save_format)
+ return self.transform_path(
+ str(data), output_path, save_format, chunk_size=chunk_size
+ )
  return self.transform_in_memory(
  data=data,
  return_dict=return_dict,