nextrec-0.2.4-py3-none-any.whl → nextrec-0.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +5 -1
- nextrec/basic/layers.py +3 -7
- nextrec/basic/model.py +495 -664
- nextrec/data/data_utils.py +44 -12
- nextrec/data/dataloader.py +84 -285
- nextrec/data/preprocessor.py +91 -213
- nextrec/loss/__init__.py +0 -1
- nextrec/loss/loss_utils.py +51 -120
- nextrec/models/multi_task/esmm.py +1 -1
- nextrec/models/ranking/masknet.py +1 -1
- nextrec/utils/__init__.py +4 -1
- nextrec/utils/common.py +16 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/METADATA +2 -2
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/RECORD +17 -16
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/WHEEL +0 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/licenses/LICENSE +0 -0
nextrec/data/preprocessor.py
CHANGED
@@ -30,8 +30,10 @@ from nextrec.data.data_utils import (
     load_dataframes,
     default_output_dir,
 )
-from nextrec.basic.session import
+from nextrec.basic.session import resolve_save_path
 from nextrec.basic.features import FeatureSpecMixin
+from nextrec.__version__ import __version__
+
 
 class DataProcessor(FeatureSpecMixin):
     """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
@@ -54,28 +56,21 @@ class DataProcessor(FeatureSpecMixin):
     >>> # Get vocabulary sizes for embedding layers
     >>> vocab_sizes = processor.get_vocab_sizes()
     """
-    def __init__(self
+    def __init__(self):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}
         self.sequence_features: Dict[str, Dict[str, Any]] = {}
         self.target_features: Dict[str, Dict[str, Any]] = {}
-        self.
-
-
+        self.version = __version__
+
         self.is_fitted = False
         self._transform_summary_printed = False  # Track if summary has been printed during transform

         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
-        self.
-
-        # Initialize logger if not already initialized
-        self._logger_initialized = False
-        if not logging.getLogger().hasHandlers():
-            setup_logger(session_id=self.session_id)
-            self._logger_initialized = True
-
+        self._set_target_id_config([], [])
+
     def add_numeric_feature(
         self,
         name: str,
@@ -96,7 +91,6 @@ class DataProcessor(FeatureSpecMixin):
     ):
         if encode_method == 'hash' and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
-
         self.sparse_features[name] = {
             'encode_method': encode_method,
             'hash_size': hash_size,
@@ -113,10 +107,8 @@ class DataProcessor(FeatureSpecMixin):
         truncate: Literal['pre', 'post'] = 'pre',  # pre: keep last max_len items, post: keep first max_len items
         separator: str = ','
     ):
-
         if encode_method == 'hash' and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
-
         self.sequence_features[name] = {
             'encode_method': encode_method,
             'hash_size': hash_size,
@@ -136,23 +128,20 @@ class DataProcessor(FeatureSpecMixin):
             'target_type': target_type,
             'label_map': label_map
         }
-        self.
+        self._set_target_id_config(list(self.target_features.keys()), [])

     def _hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

     def _process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na = config['fill_na']
-
         if data.isna().any():
             if fill_na is None:
                 # Default use mean value to fill missing values for numeric features
                 fill_na = data.mean()
             config['fill_na_value'] = fill_na
-
         if scaler_type == 'standard':
             scaler = StandardScaler()
         elif scaler_type == 'minmax':
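The `_hash_string` helper visible in the context above is the whole hashing trick used for `encode_method='hash'`: an MD5 digest taken modulo `hash_size` gives a deterministic bucket index without ever storing a vocabulary. A minimal standalone sketch (illustrative code, not the package source):

```python
import hashlib

def md5_bucket(value: str, hash_size: int) -> int:
    """Deterministically map a string to a bucket in [0, hash_size),
    mirroring the _hash_string method shown in the hunk above."""
    return int(hashlib.md5(str(value).encode()).hexdigest(), 16) % hash_size

# The same input always lands in the same bucket, across runs and platforms,
# which is why no fitted vocabulary is needed for hash-encoded features.
print(md5_bucket("user_12345", 1000))  # stable bucket id in [0, 1000)
print(md5_bucket("user_12345", 1000))  # identical to the line above
```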
@@ -167,27 +156,19 @@ class DataProcessor(FeatureSpecMixin):
             scaler = None
         else:
             raise ValueError(f"Unknown scaler type: {scaler_type}")
-
         if scaler is not None and scaler_type != 'log':
             filled_data = data.fillna(config.get('fill_na_value', 0))
             values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
             scaler.fit(values)
             self.scalers[name] = scaler

-    def _process_numeric_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
-
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na_value = config.get('fill_na_value', 0)
-
         filled_data = data.fillna(fill_na_value)
         values = np.array(filled_data.values, dtype=np.float64)
-
         if scaler_type == 'log':
             result = np.log1p(np.maximum(values, 0))
         elif scaler_type == 'none':
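The numeric transform path above is: fill missing values with the `fill_na_value` recorded at fit time, then apply either `log1p` on values clipped at zero or the fitted scaler. A small sketch under those assumptions (column name and values are illustrative only):

```python
# Minimal sketch of the numeric fit/transform flow shown above; not package code.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

col = pd.Series([1.0, 2.0, np.nan, 4.0], name="price")
fill_na_value = col.mean()                          # fit-time default: column mean
filled = col.fillna(fill_na_value).to_numpy(dtype=np.float64)

# 'log' scaler: log1p on values clipped at zero (no sklearn object involved)
log_result = np.log1p(np.maximum(filled, 0))

# 'standard' scaler: fitted once, then reused at transform time
scaler = StandardScaler().fit(filled.reshape(-1, 1))
std_result = scaler.transform(filled.reshape(-1, 1)).ravel()
print(log_result, std_result)
```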
@@ -199,17 +180,13 @@ class DataProcessor(FeatureSpecMixin):
             result = values
         else:
             result = scaler.transform(values.reshape(-1, 1)).ravel()
-
         return result

     def _process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']  # <UNK>
-
         filled_data = data.fillna(fill_na).astype(str)
-
         if encode_method == 'label':
             le = LabelEncoder()
             le.fit(filled_data)
@@ -218,49 +195,32 @@ class DataProcessor(FeatureSpecMixin):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sparse_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
-        """Fast path sparse feature transform using cached dict mapping or hashing."""
+    def _process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']
-
         sparse_series = pd.Series(data, name=name).fillna(fill_na).astype(str)
-
         if encode_method == 'label':
             le = self.label_encoders.get(name)
             if le is None:
                 raise ValueError(f"LabelEncoder for {name} not fitted")
-
             class_to_idx = config.get('_class_to_idx')
             if class_to_idx is None:
                 class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
                 config['_class_to_idx'] = class_to_idx
-
             encoded = sparse_series.map(class_to_idx)
             encoded = encoded.fillna(0).astype(np.int64)
             return encoded.to_numpy()
-
         if encode_method == 'hash':
             hash_size = config['hash_size']
             hash_fn = self._hash_string
-            return np.fromiter(
-                (hash_fn(v, hash_size) for v in sparse_series.to_numpy()),
-                dtype=np.int64,
-                count=sparse_series.size,
-            )
-
+            return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
         return np.array([], dtype=np.int64)

     def _process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         encode_method = config['encode_method']
         separator = config['separator']
-
         if encode_method == 'label':
             all_tokens = set()
             for seq in data:
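The sparse transform keeps its fast path: a `{class -> index}` dict is built once from the fitted `LabelEncoder`, cached in the feature config, and applied with `Series.map`, with unseen values falling back to index 0. A self-contained sketch of that pattern (illustrative values, not package code):

```python
# Sketch of the cached-mapping fast path in _process_sparse_feature_transform.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(["red", "green", "blue"])
class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}  # cached after first call

new_values = pd.Series(["green", "purple", "blue"])  # "purple" was never seen at fit time
encoded = new_values.map(class_to_idx).fillna(0).astype(np.int64)
print(encoded.to_numpy())  # unseen category falls back to index 0
```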
@@ -280,12 +240,9 @@ class DataProcessor(FeatureSpecMixin):
                     tokens = [str(t) for t in seq.tolist()]
                 else:
                     continue
-
                 all_tokens.update(tokens)
-
             if len(all_tokens) == 0:
                 all_tokens.add('<PAD>')
-
             le = LabelEncoder()
             le.fit(list(all_tokens))
             self.label_encoders[name] = le
@@ -293,11 +250,7 @@ class DataProcessor(FeatureSpecMixin):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sequence_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
         encode_method = config['encode_method']
@@ -305,15 +258,12 @@ class DataProcessor(FeatureSpecMixin):
         pad_value = config['pad_value']
         truncate = config['truncate']
         separator = config['separator']
-
         arr = np.asarray(data, dtype=object)
         n = arr.shape[0]
         output = np.full((n, max_len), pad_value, dtype=np.int64)
-
         # Shared helpers cached locally for speed and cross-platform consistency
         split_fn = str.split
         is_nan = np.isnan
-
         if encode_method == 'label':
             le = self.label_encoders.get(name)
             if le is None:
@@ -324,10 +274,8 @@ class DataProcessor(FeatureSpecMixin):
                 config['_class_to_idx'] = class_to_idx
         else:
             class_to_idx = None  # type: ignore
-
         hash_fn = self._hash_string
         hash_size = config.get('hash_size')
-
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings
             tokens = []
@@ -342,14 +290,12 @@ class DataProcessor(FeatureSpecMixin):
                 tokens = [str(t) for t in seq]
             else:
                 tokens = []
-
             if encode_method == 'label':
                 encoded = [
                     class_to_idx.get(token.strip(), 0)  # type: ignore[union-attr]
                     for token in tokens
                     if token is not None and token != ''
                 ]
-
             elif encode_method == 'hash':
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
@@ -360,27 +306,21 @@ class DataProcessor(FeatureSpecMixin):
                 ]
             else:
                 encoded = []
-
             if not encoded:
                 continue
-
             if len(encoded) > max_len:
                 encoded = encoded[-max_len:] if truncate == 'pre' else encoded[:max_len]
-
             output[i, : len(encoded)] = encoded
-
         return output

     def _process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         target_type = config['target_type']
-        label_map = config
-
+        label_map = config.get('label_map')
         if target_type in ['binary', 'multiclass']:
             if label_map is None:
                 unique_values = data.dropna().unique()
                 sorted_values = sorted(unique_values)
-
                 try:
                     int_values = [int(v) for v in sorted_values]
                     if int_values == list(range(len(int_values))):
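The sequence transform writes each encoded token list into a preallocated `(n, max_len)` array filled with `pad_value`, truncating long sequences first: `truncate='pre'` keeps the last `max_len` items, `'post'` keeps the first. A standalone sketch of just that padding/truncation step (the `max_len`, `pad_value`, and vocabulary below are illustrative, not values from the package):

```python
import numpy as np

max_len, pad_value, truncate = 4, 0, "pre"
class_to_idx = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
sequences = ["a,b,c,d,e", "b,c"]

output = np.full((len(sequences), max_len), pad_value, dtype=np.int64)
for i, seq in enumerate(sequences):
    encoded = [class_to_idx.get(tok.strip(), 0) for tok in seq.split(",") if tok]
    if len(encoded) > max_len:
        # 'pre' keeps the last max_len items, 'post' keeps the first max_len items
        encoded = encoded[-max_len:] if truncate == "pre" else encoded[:max_len]
    output[i, : len(encoded)] = encoded
print(output)
# [[2 3 4 5]
#  [2 3 0 0]]
```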
@@ -389,29 +329,20 @@ class DataProcessor(FeatureSpecMixin):
                         label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
                 except (ValueError, TypeError):
                     label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
-
             config['label_map'] = label_map
-
         self.target_encoders[name] = label_map

-    def _process_target_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
-
         name = str(data.name)
-        target_type = config
-
+        target_type = config.get('target_type')
         if target_type == 'regression':
             values = np.array(data.values, dtype=np.float32)
             return values
         else:
             label_map = self.target_encoders.get(name)
             if label_map is None:
-                raise ValueError(f"Target encoder for {name} not fitted")
-
+                raise ValueError(f"Target encoder for {name} not fitted")
             result = []
             for val in data:
                 str_val = str(val)
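For binary and multiclass targets, the label map built at fit time assigns consecutive indices to the sorted unique values (keeping integer identity when the values are already 0..K-1), and the transform maps unseen values to 0 with a warning. A small sketch of applying such a map (target values are made up for illustration; this is not the package implementation):

```python
import numpy as np

fit_values = ["cat", "dog", "bird"]
label_map = {str(v): i for i, v in enumerate(sorted(set(fit_values)))}
# {'bird': 0, 'cat': 1, 'dog': 2}

new_targets = ["dog", "cat", "fish"]  # "fish" was not seen at fit time
encoded = [label_map.get(str(v), 0) for v in new_targets]  # unknown -> 0 (logged as a warning in the real code)
print(np.array(encoded, dtype=np.int64))  # [2 1 0]
```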
@@ -420,7 +351,6 @@ class DataProcessor(FeatureSpecMixin):
             else:
                 logger.warning(f"Unknown target value: {val}, mapping to 0")
                 result.append(0)
-
         return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)

     def _load_dataframe_from_path(self, path: str) -> pd.DataFrame:
@@ -458,13 +388,10 @@ class DataProcessor(FeatureSpecMixin):
                 "max": -np.inf,
                 "max_abs": 0.0,
             }
-
         sparse_vocab: Dict[str, set[str]] = {name: set() for name in self.sparse_features.keys()}
         seq_vocab: Dict[str, set[str]] = {name: set() for name in self.sequence_features.keys()}
         target_values: Dict[str, set[Any]] = {name: set() for name in self.target_features.keys()}
-
         missing_features = set()
-
         for file_path in file_paths:
             for chunk in iter_file_chunks(file_path, file_type, chunk_size):
                 # numeric features
@@ -514,25 +441,19 @@ class DataProcessor(FeatureSpecMixin):
                     continue
                 vals = chunk[name].dropna().tolist()
                 target_values[name].update(vals)
-
         if missing_features:
-            logger.warning(
-                f"The following configured features were not found in provided files: {sorted(missing_features)}"
-            )
-
+            logger.warning(f"The following configured features were not found in provided files: {sorted(missing_features)}")
         # finalize numeric scalers
         for name, config in self.numeric_features.items():
             acc = numeric_acc[name]
             if acc["count"] == 0:
                 logger.warning(f"Numeric feature {name} has no valid values in provided files")
                 continue
-
             mean_val = acc["sum"] / acc["count"]
             if config["fill_na"] is not None:
                 config["fill_na_value"] = config["fill_na"]
             else:
                 config["fill_na_value"] = mean_val
-
             scaler_type = config["scaler"]
             if scaler_type == "standard":
                 var = max(acc["sumsq"] / acc["count"] - mean_val * mean_val, 0.0)
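When fitting from files in chunks, per-feature accumulators (count, sum, sum of squares, min, max) are kept across chunks, and mean and variance are recovered at the end via E[x²] − E[x]², clamped at zero, exactly as the `var = max(...)` line above does. A small sketch verifying that identity against a single-pass computation (illustrative data, not package code):

```python
import numpy as np

chunks = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0])]
acc = {"count": 0, "sum": 0.0, "sumsq": 0.0}
for chunk in chunks:
    acc["count"] += chunk.size
    acc["sum"] += chunk.sum()
    acc["sumsq"] += (chunk ** 2).sum()

mean_val = acc["sum"] / acc["count"]
var = max(acc["sumsq"] / acc["count"] - mean_val * mean_val, 0.0)

full = np.concatenate(chunks)
print(mean_val, var)            # 3.0 2.0
print(full.mean(), full.var())  # matches the streaming result
```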
@@ -550,6 +471,11 @@ class DataProcessor(FeatureSpecMixin):
                 scaler.data_max_ = np.array([data_max], dtype=np.float64)
                 scaler.data_range_ = scaler.data_max_ - scaler.data_min_
                 scaler.data_range_[scaler.data_range_ == 0] = 1.0
+                # Manually set scale_/min_ for streaming fit to mirror sklearn's internal fit logic
+                feature_min, feature_max = scaler.feature_range
+                scale = (feature_max - feature_min) / scaler.data_range_
+                scaler.scale_ = scale
+                scaler.min_ = feature_min - scaler.data_min_ * scale
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
             elif scaler_type == "maxabs":
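The added lines above populate `scale_` and `min_` by hand because the MinMaxScaler is never fitted on the full array during the streaming fit; sklearn's own fit computes `scale_ = (feature_max - feature_min) / data_range_` and `min_ = feature_min - data_min_ * scale_`. A sketch (not package code) checking that a manually populated scaler matches a regular `fit` on the same data:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

values = np.array([[1.0], [3.0], [7.0]])

fitted = MinMaxScaler().fit(values)  # reference: normal in-memory fit

manual = MinMaxScaler()
manual.data_min_ = values.min(axis=0)
manual.data_max_ = values.max(axis=0)
manual.data_range_ = manual.data_max_ - manual.data_min_
feature_min, feature_max = manual.feature_range
manual.scale_ = (feature_max - feature_min) / manual.data_range_
manual.min_ = feature_min - manual.data_min_ * manual.scale_
manual.n_samples_seen_ = np.array([values.shape[0]], dtype=np.int64)

print(np.allclose(fitted.transform(values), manual.transform(values)))  # True
```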
@@ -626,9 +552,9 @@ class DataProcessor(FeatureSpecMixin):
         return_dict: bool,
         persist: bool,
         save_format: Optional[Literal["csv", "parquet"]],
+        output_path: Optional[str],
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
-        logger = logging.getLogger()
-
+        logger = logging.getLogger()
         # Convert input to dict format for unified processing
         if isinstance(data, pd.DataFrame):
             data_dict = {col: data[col] for col in data.columns}
@@ -692,169 +618,128 @@ class DataProcessor(FeatureSpecMixin):
             else:
                 columns_dict[key] = value
             return pd.DataFrame(columns_dict)
-
-
-
-
-
+
+        if save_format not in [None, "csv", "parquet"]:
+            raise ValueError("save_format must be either 'csv', 'parquet', or None")
+        effective_format = save_format
+        if persist:
+            effective_format = save_format or "parquet"
         result_df = None
-        if (not return_dict) or
+        if (not return_dict) or persist:
             result_df = _dict_to_dataframe(result_dict)
-
-
-
-
-
-
-
-
-
-
-
-            if save_format == "parquet":
+        if persist:
+            if output_path is None:
+                raise ValueError("output_path must be provided when persisting transformed data.")
+            output_dir = Path(output_path)
+            if output_dir.suffix:
+                output_dir = output_dir.parent
+            output_dir.mkdir(parents=True, exist_ok=True)
+            save_path = output_dir / f"transformed_data.{effective_format}"
+            assert result_df is not None, "DataFrame conversion failed"
+            if effective_format == "parquet":
                 result_df.to_parquet(save_path, index=False)
             else:
                 result_df.to_csv(save_path, index=False)
-
-            logger.info(colorize(
-                f"Transformed data saved to: {save_path}",
-                color="green"
-            ))
-
+            logger.info(colorize(f"Transformed data saved to: {save_path.resolve()}", color="green"))
         if return_dict:
             return result_dict
+        assert result_df is not None, "DataFrame is None after transform"
         return result_df

-    def _transform_path(
+    def _transform_path(
+        self,
+        input_path: str,
+        output_path: Optional[str],
+        save_format: Optional[Literal["csv", "parquet"]],
+    ) -> list[str]:
         """Transform data from files under a path and save them to a new location."""
         logger = logging.getLogger()
-
-
-
-
-
-
-
-
-        if not output_path_obj.is_absolute():
-            output_path_obj = self.session_dir / output_path_obj
-        if output_path_obj.suffix.lower() in {".csv", ".parquet"}:
-            if len(file_paths) != 1:
-                raise ValueError("output_path points to a file but multiple input files were provided.")
-            target_file_override = output_path_obj
-            output_root = output_path_obj.parent
-        else:
-            output_root = output_path_obj
-
+        file_paths, file_type = resolve_file_paths(input_path)
+        target_format = save_format or file_type
+        if target_format not in ["csv", "parquet"]:
+            raise ValueError("save_format must be either 'csv' or 'parquet'")
+        base_output_dir = Path(output_path) if output_path else default_output_dir(input_path)
+        if base_output_dir.suffix:
+            base_output_dir = base_output_dir.parent
+        output_root = base_output_dir / "transformed_data"
         output_root.mkdir(parents=True, exist_ok=True)
-
-        saved_paths: list[str] = []
+        saved_paths = []
         for file_path in file_paths:
             df = read_table(file_path, file_type)
-
             transformed_df = self._transform_in_memory(
                 df,
                 return_dict=False,
                 persist=False,
                 save_format=None,
+                output_path=None,
             )
             assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
-
             source_path = Path(file_path)
-            target_file =
-
-            if target_file_override is not None
-            else output_root / f"{source_path.stem}_preprocessed{source_path.suffix}"
-            )
-
-            if file_type == "csv":
+            target_file = output_root / f"{source_path.stem}.{target_format}"
+            if target_format == "csv":
                 transformed_df.to_csv(target_file, index=False)
             else:
                 transformed_df.to_parquet(target_file, index=False)
-
             saved_paths.append(str(target_file.resolve()))
-
-        logger.info(colorize(
-            f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}",
-            color="green",
-        ))
+        logger.info(colorize(f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}", color="green",))
         return saved_paths

     # fit is nothing but registering the statistics from data so that we can transform the data later
-    def fit(
-        self,
-        data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
-        chunk_size: int = 200000,
-    ):
+    def fit(self, data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],chunk_size: int = 200000,):
         logger = logging.getLogger()
-
         if isinstance(data, (str, os.PathLike)):
             path_str = str(data)
             uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
             if uses_robust:
-                logger.warning(
-                    "Robust scaler requires full data; loading all files into memory. "
-                    "Consider smaller chunk_size or different scaler if memory is limited."
-                )
+                logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
                 data = self._load_dataframe_from_path(path_str)
             else:
                 return self._fit_from_path(path_str, chunk_size)
         if isinstance(data, dict):
             data = pd.DataFrame(data)
-
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
-
         for name, config in self.numeric_features.items():
             if name not in data.columns:
                 logger.warning(f"Numeric feature {name} not found in data")
                 continue
             self._process_numeric_feature_fit(data[name], config)
-
         for name, config in self.sparse_features.items():
             if name not in data.columns:
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
             self._process_sparse_feature_fit(data[name], config)
-
         for name, config in self.sequence_features.items():
             if name not in data.columns:
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
             self._process_sequence_feature_fit(data[name], config)
-
         for name, config in self.target_features.items():
             if name not in data.columns:
                 logger.warning(f"Target {name} not found in data")
                 continue
             self._process_target_fit(data[name], config)
-
         self.is_fitted = True
-        logger.info(colorize("DataProcessor fitted successfully", color="green", bold=True))
         return self

     def transform(
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        persist: bool = False,
         save_format: Optional[Literal["csv", "parquet"]] = None,
         output_path: Optional[str] = None,
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray], list[str]]:
-        logger = logging.getLogger()
-
         if not self.is_fitted:
             raise ValueError("DataProcessor must be fitted before transform")
-
         if isinstance(data, (str, os.PathLike)):
-            if return_dict
-            raise ValueError("Path transform writes files only;
-            return self._transform_path(str(data), output_path)
-
+            if return_dict:
+                raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
+            return self._transform_path(str(data), output_path, save_format)
         return self._transform_in_memory(
             data=data,
             return_dict=return_dict,
-            persist=
+            persist=output_path is not None,
             save_format=save_format,
+            output_path=output_path,
         )

     def fit_transform(
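In 0.2.5 the `persist` flag is gone from the public `transform` signature: supplying `output_path` is what triggers persistence, `save_format` defaults to parquet when persisting in memory, and a file-like `output_path` (one with a suffix) contributes only its parent directory. A standalone sketch of that resolution logic as added in `_transform_in_memory` above; `resolve_persist_target` is a hypothetical helper for illustration, not a function in the package:

```python
from pathlib import Path
from typing import Optional

def resolve_persist_target(persist: bool, save_format: Optional[str], output_path: Optional[str]) -> Optional[Path]:
    if save_format not in [None, "csv", "parquet"]:
        raise ValueError("save_format must be either 'csv', 'parquet', or None")
    if not persist:
        return None
    if output_path is None:
        raise ValueError("output_path must be provided when persisting transformed data.")
    effective_format = save_format or "parquet"   # parquet is the persistence default
    output_dir = Path(output_path)
    if output_dir.suffix:                         # file-like path: use its parent directory
        output_dir = output_dir.parent
    return output_dir / f"transformed_data.{effective_format}"

print(resolve_persist_target(True, None, "./out"))            # out/transformed_data.parquet
print(resolve_persist_target(True, "csv", "./out/data.csv"))  # out/transformed_data.csv
print(resolve_persist_target(False, None, None))              # None
```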
@@ -872,21 +757,20 @@ class DataProcessor(FeatureSpecMixin):
             save_format=save_format,
             output_path=output_path,
         )
-
-    def save(self, save_path: str):
-        logger = logging.getLogger()

+    def save(self, save_path: str | Path):
+        logger = logging.getLogger()
+        assert isinstance(save_path, (str, Path)), "save_path must be a string or Path"
+        save_path = Path(save_path)
         if not self.is_fitted:
             logger.warning("Saving unfitted DataProcessor")
-
         target_path = resolve_save_path(
             path=save_path,
-            default_dir=
-            default_name="
+            default_dir=Path(os.getcwd()),
+            default_name="fitted_processor",
             suffix=".pkl",
+            add_timestamp=True
         )
-
-        # Prepare state dict
         state = {
             "numeric_features": self.numeric_features,
             "sparse_features": self.sparse_features,
@@ -896,43 +780,37 @@ class DataProcessor(FeatureSpecMixin):
             "scalers": self.scalers,
             "label_encoders": self.label_encoders,
             "target_encoders": self.target_encoders,
+            "processor_version": __version__,
         }
-
-        # Save with pickle
         with open(target_path, "wb") as f:
             pickle.dump(state, f)
-
-        logger.info(colorize(f"DataProcessor saved to: {target_path}", color="green"))
+        logger.info(f"DataProcessor saved to: {target_path}, NextRec version: {self.version}")

     @classmethod
-    def load(cls, load_path: str) -> 'DataProcessor':
+    def load(cls, load_path: str | Path) -> 'DataProcessor':
         logger = logging.getLogger()
-
+        load_path = Path(load_path)
         with open(load_path, 'rb') as f:
             state = pickle.load(f)
-
         processor = cls()
-        processor.numeric_features = state
-        processor.sparse_features = state
-        processor.sequence_features = state
-        processor.target_features = state
-        processor.is_fitted = state
-        processor.scalers = state
-        processor.label_encoders = state
-        processor.target_encoders = state
-
-        logger.info(f"DataProcessor loaded from {load_path}")
+        processor.numeric_features = state.get('numeric_features', {})
+        processor.sparse_features = state.get('sparse_features', {})
+        processor.sequence_features = state.get('sequence_features', {})
+        processor.target_features = state.get('target_features', {})
+        processor.is_fitted = state.get('is_fitted', False)
+        processor.scalers = state.get('scalers', {})
+        processor.label_encoders = state.get('label_encoders', {})
+        processor.target_encoders = state.get('target_encoders', {})
+        processor.version = state.get("processor_version", "unknown")
+        logger.info(f"DataProcessor loaded from {load_path}, NextRec version: {processor.version}")
         return processor

     def get_vocab_sizes(self) -> Dict[str, int]:
         vocab_sizes = {}
-
         for name, config in self.sparse_features.items():
             vocab_sizes[name] = config.get('vocab_size', 0)
-
         for name, config in self.sequence_features.items():
             vocab_sizes[name] = config.get('vocab_size', 0)
-
         return vocab_sizes

     def summary(self):
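The save/load pair now pickles a plain state dict that includes a `processor_version` key and restores each field defensively with `.get`, so pickles written by older releases still load with sensible defaults. A minimal, self-contained sketch of that versioned round-trip (illustrative state values, not the package implementation):

```python
import pickle
from pathlib import Path

state = {
    "numeric_features": {"price": {"scaler": "standard"}},
    "label_encoders": {},
    "processor_version": "0.2.5",
}

path = Path("fitted_processor.pkl")
with open(path, "wb") as f:
    pickle.dump(state, f)

with open(path, "rb") as f:
    restored = pickle.load(f)

# Keys missing from older files fall back to defaults instead of raising KeyError.
numeric_features = restored.get("numeric_features", {})
version = restored.get("processor_version", "unknown")
print(version, numeric_features)
```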