nextrec 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +5 -1
- nextrec/basic/layers.py +3 -7
- nextrec/basic/model.py +495 -664
- nextrec/data/data_utils.py +44 -12
- nextrec/data/dataloader.py +84 -285
- nextrec/data/preprocessor.py +93 -214
- nextrec/loss/__init__.py +0 -1
- nextrec/loss/loss_utils.py +51 -120
- nextrec/models/multi_task/esmm.py +1 -1
- nextrec/models/ranking/masknet.py +1 -1
- nextrec/utils/__init__.py +4 -1
- nextrec/utils/common.py +16 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/METADATA +2 -2
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/RECORD +17 -16
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/WHEEL +0 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.6.dist-info}/licenses/LICENSE +0 -0
nextrec/data/preprocessor.py
CHANGED
@@ -30,8 +30,10 @@ from nextrec.data.data_utils import (
     load_dataframes,
     default_output_dir,
 )
-from nextrec.basic.session import
+from nextrec.basic.session import resolve_save_path
 from nextrec.basic.features import FeatureSpecMixin
+from nextrec.__version__ import __version__
+

 class DataProcessor(FeatureSpecMixin):
     """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
@@ -54,28 +56,21 @@ class DataProcessor(FeatureSpecMixin):
     >>> # Get vocabulary sizes for embedding layers
     >>> vocab_sizes = processor.get_vocab_sizes()
     """
-    def __init__(self
+    def __init__(self):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}
         self.sequence_features: Dict[str, Dict[str, Any]] = {}
         self.target_features: Dict[str, Dict[str, Any]] = {}
-        self.
-
-
+        self.version = __version__
+
         self.is_fitted = False
         self._transform_summary_printed = False  # Track if summary has been printed during transform

         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
-        self.
-
-        # Initialize logger if not already initialized
-        self._logger_initialized = False
-        if not logging.getLogger().hasHandlers():
-            setup_logger(session_id=self.session_id)
-            self._logger_initialized = True
-
+        self._set_target_id_config([], [])
+
     def add_numeric_feature(
         self,
         name: str,
@@ -96,7 +91,6 @@ class DataProcessor(FeatureSpecMixin):
     ):
         if encode_method == 'hash' and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
-
         self.sparse_features[name] = {
             'encode_method': encode_method,
             'hash_size': hash_size,
@@ -113,10 +107,8 @@ class DataProcessor(FeatureSpecMixin):
         truncate: Literal['pre', 'post'] = 'pre',  # pre: keep last max_len items, post: keep first max_len items
         separator: str = ','
     ):
-
         if encode_method == 'hash' and hash_size is None:
             raise ValueError("hash_size must be specified when encode_method='hash'")
-
         self.sequence_features[name] = {
             'encode_method': encode_method,
             'hash_size': hash_size,
@@ -136,23 +128,20 @@ class DataProcessor(FeatureSpecMixin):
             'target_type': target_type,
             'label_map': label_map
         }
-        self.
+        self._set_target_id_config(list(self.target_features.keys()), [])

     def _hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

     def _process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na = config['fill_na']
-
         if data.isna().any():
             if fill_na is None:
                 # Default use mean value to fill missing values for numeric features
                 fill_na = data.mean()
             config['fill_na_value'] = fill_na
-
         if scaler_type == 'standard':
             scaler = StandardScaler()
         elif scaler_type == 'minmax':
@@ -167,27 +156,19 @@ class DataProcessor(FeatureSpecMixin):
             scaler = None
         else:
             raise ValueError(f"Unknown scaler type: {scaler_type}")
-
         if scaler is not None and scaler_type != 'log':
             filled_data = data.fillna(config.get('fill_na_value', 0))
             values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
             scaler.fit(values)
             self.scalers[name] = scaler

-    def _process_numeric_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
-
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na_value = config.get('fill_na_value', 0)
-
         filled_data = data.fillna(fill_na_value)
         values = np.array(filled_data.values, dtype=np.float64)
-
         if scaler_type == 'log':
             result = np.log1p(np.maximum(values, 0))
         elif scaler_type == 'none':
@@ -199,17 +180,13 @@ class DataProcessor(FeatureSpecMixin):
             result = values
         else:
             result = scaler.transform(values.reshape(-1, 1)).ravel()
-
         return result

     def _process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']  # <UNK>
-
         filled_data = data.fillna(fill_na).astype(str)
-
         if encode_method == 'label':
             le = LabelEncoder()
             le.fit(filled_data)
@@ -218,49 +195,32 @@ class DataProcessor(FeatureSpecMixin):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sparse_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
-        """Fast path sparse feature transform using cached dict mapping or hashing."""
+    def _process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']
-
         sparse_series = pd.Series(data, name=name).fillna(fill_na).astype(str)
-
         if encode_method == 'label':
             le = self.label_encoders.get(name)
             if le is None:
                 raise ValueError(f"LabelEncoder for {name} not fitted")
-
             class_to_idx = config.get('_class_to_idx')
             if class_to_idx is None:
                 class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
                 config['_class_to_idx'] = class_to_idx
-
             encoded = sparse_series.map(class_to_idx)
             encoded = encoded.fillna(0).astype(np.int64)
             return encoded.to_numpy()
-
         if encode_method == 'hash':
             hash_size = config['hash_size']
             hash_fn = self._hash_string
-            return np.fromiter(
-                (hash_fn(v, hash_size) for v in sparse_series.to_numpy()),
-                dtype=np.int64,
-                count=sparse_series.size,
-            )
-
+            return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
         return np.array([], dtype=np.int64)

     def _process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-
         name = str(data.name)
         encode_method = config['encode_method']
         separator = config['separator']
-
         if encode_method == 'label':
             all_tokens = set()
             for seq in data:
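Note: both the sparse and sequence 'hash' paths rely on `_hash_string`, which buckets raw strings with MD5 so the resulting ids are stable across runs and platforms. A minimal standalone sketch of that hashing trick (names and sizes here are illustrative, not taken from the package):

    import hashlib
    import numpy as np

    def hash_to_bucket(value, hash_size: int) -> int:
        # Same idea as DataProcessor._hash_string: MD5 digest modulo a fixed vocab size
        return int(hashlib.md5(str(value).encode()).hexdigest(), 16) % hash_size

    values = ["user_1", "user_2", "user_1", None]
    ids = np.fromiter((hash_to_bucket(v, 1000) for v in values), dtype=np.int64, count=len(values))
    # Unlike Python's built-in hash(), the MD5-based bucket ids do not change between processes.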
@@ -280,12 +240,9 @@ class DataProcessor(FeatureSpecMixin):
                     tokens = [str(t) for t in seq.tolist()]
                 else:
                     continue
-
                 all_tokens.update(tokens)
-
             if len(all_tokens) == 0:
                 all_tokens.add('<PAD>')
-
             le = LabelEncoder()
             le.fit(list(all_tokens))
             self.label_encoders[name] = le
@@ -293,11 +250,7 @@ class DataProcessor(FeatureSpecMixin):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sequence_feature_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
         encode_method = config['encode_method']
@@ -305,15 +258,12 @@ class DataProcessor(FeatureSpecMixin):
         pad_value = config['pad_value']
         truncate = config['truncate']
         separator = config['separator']
-
         arr = np.asarray(data, dtype=object)
         n = arr.shape[0]
         output = np.full((n, max_len), pad_value, dtype=np.int64)
-
         # Shared helpers cached locally for speed and cross-platform consistency
         split_fn = str.split
         is_nan = np.isnan
-
         if encode_method == 'label':
             le = self.label_encoders.get(name)
             if le is None:
@@ -324,10 +274,8 @@ class DataProcessor(FeatureSpecMixin):
                 config['_class_to_idx'] = class_to_idx
             else:
                 class_to_idx = None  # type: ignore
-
         hash_fn = self._hash_string
         hash_size = config.get('hash_size')
-
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings
             tokens = []
@@ -342,14 +290,12 @@ class DataProcessor(FeatureSpecMixin):
                 tokens = [str(t) for t in seq]
             else:
                 tokens = []
-
             if encode_method == 'label':
                 encoded = [
                     class_to_idx.get(token.strip(), 0)  # type: ignore[union-attr]
                     for token in tokens
                     if token is not None and token != ''
                 ]
-
             elif encode_method == 'hash':
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
@@ -360,27 +306,21 @@ class DataProcessor(FeatureSpecMixin):
                 ]
             else:
                 encoded = []
-
             if not encoded:
                 continue
-
             if len(encoded) > max_len:
                 encoded = encoded[-max_len:] if truncate == 'pre' else encoded[:max_len]
-
             output[i, : len(encoded)] = encoded
-
         return output

     def _process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         target_type = config['target_type']
-        label_map = config
-
+        label_map = config.get('label_map')
         if target_type in ['binary', 'multiclass']:
             if label_map is None:
                 unique_values = data.dropna().unique()
                 sorted_values = sorted(unique_values)
-
                 try:
                     int_values = [int(v) for v in sorted_values]
                     if int_values == list(range(len(int_values))):
@@ -389,29 +329,20 @@ class DataProcessor(FeatureSpecMixin):
                         label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
                 except (ValueError, TypeError):
                     label_map = {str(val): idx for idx, val in enumerate(sorted_values)}
-
             config['label_map'] = label_map
-
             self.target_encoders[name] = label_map

-    def _process_target_transform(
-        self,
-        data: pd.Series,
-        config: Dict[str, Any]
-    ) -> np.ndarray:
+    def _process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
-
         name = str(data.name)
-        target_type = config
-
+        target_type = config.get('target_type')
         if target_type == 'regression':
             values = np.array(data.values, dtype=np.float32)
             return values
         else:
             label_map = self.target_encoders.get(name)
             if label_map is None:
-                raise ValueError(f"Target encoder for {name} not fitted")
-
+                raise ValueError(f"Target encoder for {name} not fitted")
             result = []
             for val in data:
                 str_val = str(val)
@@ -420,7 +351,6 @@ class DataProcessor(FeatureSpecMixin):
                 else:
                     logger.warning(f"Unknown target value: {val}, mapping to 0")
                     result.append(0)
-
             return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)

     def _load_dataframe_from_path(self, path: str) -> pd.DataFrame:
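Note: for binary/multiclass targets the fitted mapping lives in `target_encoders`, and unseen labels fall back to 0 with a warning. A tiny standalone illustration of that mapping logic (toy labels only, no nextrec imports):

    import numpy as np

    labels = ["cat", "dog", "cat", "bird"]
    sorted_values = sorted(set(str(v) for v in labels))
    label_map = {v: i for i, v in enumerate(sorted_values)}     # {'bird': 0, 'cat': 1, 'dog': 2}

    def encode(values, label_map):
        # unknown values map to 0, matching the warning branch in _process_target_transform
        return np.array([label_map.get(str(v), 0) for v in values], dtype=np.int64)

    print(encode(["dog", "cat", "fish"], label_map))            # [2 1 0]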
@@ -458,13 +388,10 @@ class DataProcessor(FeatureSpecMixin):
                 "max": -np.inf,
                 "max_abs": 0.0,
             }
-
         sparse_vocab: Dict[str, set[str]] = {name: set() for name in self.sparse_features.keys()}
         seq_vocab: Dict[str, set[str]] = {name: set() for name in self.sequence_features.keys()}
         target_values: Dict[str, set[Any]] = {name: set() for name in self.target_features.keys()}
-
         missing_features = set()
-
         for file_path in file_paths:
             for chunk in iter_file_chunks(file_path, file_type, chunk_size):
                 # numeric features
@@ -514,25 +441,19 @@ class DataProcessor(FeatureSpecMixin):
                     continue
                 vals = chunk[name].dropna().tolist()
                 target_values[name].update(vals)
-
         if missing_features:
-            logger.warning(
-                f"The following configured features were not found in provided files: {sorted(missing_features)}"
-            )
-
+            logger.warning(f"The following configured features were not found in provided files: {sorted(missing_features)}")
         # finalize numeric scalers
         for name, config in self.numeric_features.items():
             acc = numeric_acc[name]
             if acc["count"] == 0:
                 logger.warning(f"Numeric feature {name} has no valid values in provided files")
                 continue
-
             mean_val = acc["sum"] / acc["count"]
             if config["fill_na"] is not None:
                 config["fill_na_value"] = config["fill_na"]
             else:
                 config["fill_na_value"] = mean_val
-
             scaler_type = config["scaler"]
             if scaler_type == "standard":
                 var = max(acc["sumsq"] / acc["count"] - mean_val * mean_val, 0.0)
@@ -550,6 +471,11 @@ class DataProcessor(FeatureSpecMixin):
                 scaler.data_max_ = np.array([data_max], dtype=np.float64)
                 scaler.data_range_ = scaler.data_max_ - scaler.data_min_
                 scaler.data_range_[scaler.data_range_ == 0] = 1.0
+                # Manually set scale_/min_ for streaming fit to mirror sklearn's internal fit logic
+                feature_min, feature_max = scaler.feature_range
+                scale = (feature_max - feature_min) / scaler.data_range_
+                scaler.scale_ = scale
+                scaler.min_ = feature_min - scaler.data_min_ * scale
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
             elif scaler_type == "maxabs":
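Note: the added lines finalize a MinMaxScaler from streamed min/max statistics instead of calling `fit` on the whole column. A self-contained check of that arithmetic against sklearn's public attributes (the feature values and counts below are made up):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # Pretend these were accumulated while iterating file chunks
    data_min, data_max, count = 2.0, 10.0, 1_000_000

    scaler = MinMaxScaler()
    scaler.data_min_ = np.array([data_min], dtype=np.float64)
    scaler.data_max_ = np.array([data_max], dtype=np.float64)
    scaler.data_range_ = scaler.data_max_ - scaler.data_min_
    scaler.data_range_[scaler.data_range_ == 0] = 1.0

    # Mirrors sklearn's fit; transform then computes X * scale_ + min_
    feature_min, feature_max = scaler.feature_range
    scaler.scale_ = (feature_max - feature_min) / scaler.data_range_
    scaler.min_ = feature_min - scaler.data_min_ * scaler.scale_
    scaler.n_samples_seen_ = np.array([count], dtype=np.int64)

    print(scaler.transform(np.array([[2.0], [6.0], [10.0]])))  # [[0.], [0.5], [1.]]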
@@ -626,9 +552,9 @@ class DataProcessor(FeatureSpecMixin):
         return_dict: bool,
         persist: bool,
         save_format: Optional[Literal["csv", "parquet"]],
+        output_path: Optional[str],
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
-        logger = logging.getLogger()
-
+        logger = logging.getLogger()
         # Convert input to dict format for unified processing
         if isinstance(data, pd.DataFrame):
             data_dict = {col: data[col] for col in data.columns}
@@ -688,173 +614,133 @@ class DataProcessor(FeatureSpecMixin):
             columns_dict = {}
             for key, value in result.items():
                 if key in self.sequence_features:
-
+                    # Use tolist to coerce numpy scalars to native Python ints for stable CSV rendering
+                    columns_dict[key] = [np.asarray(seq).tolist() for seq in value]
                 else:
                     columns_dict[key] = value
             return pd.DataFrame(columns_dict)
-
-
-
-
-
+
+        if save_format not in [None, "csv", "parquet"]:
+            raise ValueError("save_format must be either 'csv', 'parquet', or None")
+        effective_format = save_format
+        if persist:
+            effective_format = save_format or "parquet"
         result_df = None
-        if (not return_dict) or
+        if (not return_dict) or persist:
             result_df = _dict_to_dataframe(result_dict)
-
-
-
-
-
-
-
-
-
-
-
-            if save_format == "parquet":
+        if persist:
+            if output_path is None:
+                raise ValueError("output_path must be provided when persisting transformed data.")
+            output_dir = Path(output_path)
+            if output_dir.suffix:
+                output_dir = output_dir.parent
+            output_dir.mkdir(parents=True, exist_ok=True)
+            save_path = output_dir / f"transformed_data.{effective_format}"
+            assert result_df is not None, "DataFrame conversion failed"
+            if effective_format == "parquet":
                 result_df.to_parquet(save_path, index=False)
             else:
                 result_df.to_csv(save_path, index=False)
-
-            logger.info(colorize(
-                f"Transformed data saved to: {save_path}",
-                color="green"
-            ))
-
+            logger.info(colorize(f"Transformed data saved to: {save_path.resolve()}", color="green"))
         if return_dict:
             return result_dict
+        assert result_df is not None, "DataFrame is None after transform"
         return result_df

-    def _transform_path(
+    def _transform_path(
+        self,
+        input_path: str,
+        output_path: Optional[str],
+        save_format: Optional[Literal["csv", "parquet"]],
+    ) -> list[str]:
         """Transform data from files under a path and save them to a new location."""
         logger = logging.getLogger()
-
-
-
-
-
-
-
-
-        if not output_path_obj.is_absolute():
-            output_path_obj = self.session_dir / output_path_obj
-        if output_path_obj.suffix.lower() in {".csv", ".parquet"}:
-            if len(file_paths) != 1:
-                raise ValueError("output_path points to a file but multiple input files were provided.")
-            target_file_override = output_path_obj
-            output_root = output_path_obj.parent
-        else:
-            output_root = output_path_obj
-
+        file_paths, file_type = resolve_file_paths(input_path)
+        target_format = save_format or file_type
+        if target_format not in ["csv", "parquet"]:
+            raise ValueError("save_format must be either 'csv' or 'parquet'")
+        base_output_dir = Path(output_path) if output_path else default_output_dir(input_path)
+        if base_output_dir.suffix:
+            base_output_dir = base_output_dir.parent
+        output_root = base_output_dir / "transformed_data"
         output_root.mkdir(parents=True, exist_ok=True)
-
-        saved_paths: list[str] = []
+        saved_paths = []
         for file_path in file_paths:
             df = read_table(file_path, file_type)
-
             transformed_df = self._transform_in_memory(
                 df,
                 return_dict=False,
                 persist=False,
                 save_format=None,
+                output_path=None,
             )
             assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
-
             source_path = Path(file_path)
-            target_file =
-
-            if target_file_override is not None
-            else output_root / f"{source_path.stem}_preprocessed{source_path.suffix}"
-            )
-
-            if file_type == "csv":
+            target_file = output_root / f"{source_path.stem}.{target_format}"
+            if target_format == "csv":
                 transformed_df.to_csv(target_file, index=False)
             else:
                 transformed_df.to_parquet(target_file, index=False)
-
             saved_paths.append(str(target_file.resolve()))
-
-        logger.info(colorize(
-            f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}",
-            color="green",
-        ))
+        logger.info(colorize(f"Transformed {len(saved_paths)} file(s) saved to: {output_root.resolve()}", color="green",))
         return saved_paths

     # fit is nothing but registering the statistics from data so that we can transform the data later
-    def fit(
-        self,
-        data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
-        chunk_size: int = 200000,
-    ):
+    def fit(self, data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],chunk_size: int = 200000,):
         logger = logging.getLogger()
-
         if isinstance(data, (str, os.PathLike)):
             path_str = str(data)
             uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
             if uses_robust:
-                logger.warning(
-                    "Robust scaler requires full data; loading all files into memory. "
-                    "Consider smaller chunk_size or different scaler if memory is limited."
-                )
+                logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
                 data = self._load_dataframe_from_path(path_str)
             else:
                 return self._fit_from_path(path_str, chunk_size)
         if isinstance(data, dict):
             data = pd.DataFrame(data)
-
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
-
         for name, config in self.numeric_features.items():
             if name not in data.columns:
                 logger.warning(f"Numeric feature {name} not found in data")
                 continue
             self._process_numeric_feature_fit(data[name], config)
-
         for name, config in self.sparse_features.items():
             if name not in data.columns:
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
             self._process_sparse_feature_fit(data[name], config)
-
         for name, config in self.sequence_features.items():
             if name not in data.columns:
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
             self._process_sequence_feature_fit(data[name], config)
-
         for name, config in self.target_features.items():
             if name not in data.columns:
                 logger.warning(f"Target {name} not found in data")
                 continue
             self._process_target_fit(data[name], config)
-
         self.is_fitted = True
-        logger.info(colorize("DataProcessor fitted successfully", color="green", bold=True))
         return self

     def transform(
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        persist: bool = False,
         save_format: Optional[Literal["csv", "parquet"]] = None,
         output_path: Optional[str] = None,
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray], list[str]]:
-        logger = logging.getLogger()
-
         if not self.is_fitted:
             raise ValueError("DataProcessor must be fitted before transform")
-
         if isinstance(data, (str, os.PathLike)):
-            if return_dict
-            raise ValueError("Path transform writes files only;
-            return self._transform_path(str(data), output_path)
-
+            if return_dict:
+                raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
+            return self._transform_path(str(data), output_path, save_format)
         return self._transform_in_memory(
             data=data,
             return_dict=return_dict,
-            persist=
+            persist=output_path is not None,
             save_format=save_format,
+            output_path=output_path,
         )

     def fit_transform(
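Note: as the hunk above shows, `transform` no longer takes a `persist` flag; writing to disk is keyed off `output_path`, and path inputs must be called with `return_dict=False`. A usage sketch of the 0.2.6 call patterns (the fitted `processor`, `train_df`, and the paths are placeholders, not from the package):

    # assumes: processor is a fitted DataProcessor and train_df is a pandas DataFrame
    arrays = processor.transform(train_df)                      # in-memory, dict of numpy arrays

    df = processor.transform(train_df, return_dict=False,
                             output_path="out/")                # also writes out/transformed_data.parquet

    saved_files = processor.transform("data/train/", return_dict=False,
                                      output_path="out/", save_format="parquet")
    # path input: each source file is transformed, written under out/transformed_data/, and the paths returned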
@@ -872,21 +758,20 @@ class DataProcessor(FeatureSpecMixin):
             save_format=save_format,
             output_path=output_path,
         )
-
-    def save(self, save_path: str):
-        logger = logging.getLogger()

+    def save(self, save_path: str | Path):
+        logger = logging.getLogger()
+        assert isinstance(save_path, (str, Path)), "save_path must be a string or Path"
+        save_path = Path(save_path)
         if not self.is_fitted:
             logger.warning("Saving unfitted DataProcessor")
-
         target_path = resolve_save_path(
             path=save_path,
-            default_dir=
-            default_name="
+            default_dir=Path(os.getcwd()),
+            default_name="fitted_processor",
             suffix=".pkl",
+            add_timestamp=False
         )
-
-        # Prepare state dict
         state = {
             "numeric_features": self.numeric_features,
             "sparse_features": self.sparse_features,
@@ -896,43 +781,37 @@ class DataProcessor(FeatureSpecMixin):
             "scalers": self.scalers,
             "label_encoders": self.label_encoders,
             "target_encoders": self.target_encoders,
+            "processor_version": __version__,
         }
-
-        # Save with pickle
         with open(target_path, "wb") as f:
             pickle.dump(state, f)
-
-        logger.info(colorize(f"DataProcessor saved to: {target_path}", color="green"))
+        logger.info(f"DataProcessor saved to: {target_path}, NextRec version: {self.version}")

     @classmethod
-    def load(cls, load_path: str) -> 'DataProcessor':
+    def load(cls, load_path: str | Path) -> 'DataProcessor':
         logger = logging.getLogger()
-
+        load_path = Path(load_path)
         with open(load_path, 'rb') as f:
             state = pickle.load(f)
-
         processor = cls()
-        processor.numeric_features = state
-        processor.sparse_features = state
-        processor.sequence_features = state
-        processor.target_features = state
-        processor.is_fitted = state
-        processor.scalers = state
-        processor.label_encoders = state
-        processor.target_encoders = state
-
-        logger.info(f"DataProcessor loaded from {load_path}")
+        processor.numeric_features = state.get('numeric_features', {})
+        processor.sparse_features = state.get('sparse_features', {})
+        processor.sequence_features = state.get('sequence_features', {})
+        processor.target_features = state.get('target_features', {})
+        processor.is_fitted = state.get('is_fitted', False)
+        processor.scalers = state.get('scalers', {})
+        processor.label_encoders = state.get('label_encoders', {})
+        processor.target_encoders = state.get('target_encoders', {})
+        processor.version = state.get("processor_version", "unknown")
+        logger.info(f"DataProcessor loaded from {load_path}, NextRec version: {processor.version}")
         return processor

     def get_vocab_sizes(self) -> Dict[str, int]:
         vocab_sizes = {}
-
         for name, config in self.sparse_features.items():
             vocab_sizes[name] = config.get('vocab_size', 0)
-
         for name, config in self.sequence_features.items():
             vocab_sizes[name] = config.get('vocab_size', 0)
-
         return vocab_sizes

     def summary(self):
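Note: `save` now stamps the pickled state with a "processor_version" key and `load` restores each field with `state.get(...)`, so older pickles load with version "unknown". A round-trip sketch (the import path is inferred from the diffed file location; the file name is arbitrary):

    from nextrec.data.preprocessor import DataProcessor

    processor = DataProcessor()
    # ... register features and call processor.fit(...) here ...
    processor.save("fitted_processor.pkl")          # resolve_save_path fills in directory/name/suffix defaults

    restored = DataProcessor.load("fitted_processor.pkl")
    print(restored.version)                         # NextRec version recorded at save time
    print(restored.get_vocab_sizes())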