nextrec 0.4.8-py3-none-any.whl → 0.4.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/callback.py +30 -15
- nextrec/basic/features.py +1 -0
- nextrec/basic/layers.py +6 -8
- nextrec/basic/loggers.py +14 -7
- nextrec/basic/metrics.py +6 -76
- nextrec/basic/model.py +312 -318
- nextrec/cli.py +5 -10
- nextrec/data/__init__.py +13 -16
- nextrec/data/batch_utils.py +3 -2
- nextrec/data/data_processing.py +10 -2
- nextrec/data/data_utils.py +9 -14
- nextrec/data/dataloader.py +12 -13
- nextrec/data/preprocessor.py +328 -255
- nextrec/loss/__init__.py +1 -5
- nextrec/loss/loss_utils.py +2 -8
- nextrec/models/generative/__init__.py +1 -8
- nextrec/models/generative/hstu.py +6 -4
- nextrec/models/multi_task/esmm.py +2 -2
- nextrec/models/multi_task/mmoe.py +2 -2
- nextrec/models/multi_task/ple.py +2 -2
- nextrec/models/multi_task/poso.py +2 -3
- nextrec/models/multi_task/share_bottom.py +2 -2
- nextrec/models/ranking/afm.py +2 -2
- nextrec/models/ranking/autoint.py +2 -2
- nextrec/models/ranking/dcn.py +2 -2
- nextrec/models/ranking/dcn_v2.py +2 -2
- nextrec/models/ranking/deepfm.py +2 -2
- nextrec/models/ranking/dien.py +3 -3
- nextrec/models/ranking/din.py +3 -3
- nextrec/models/ranking/ffm.py +0 -0
- nextrec/models/ranking/fibinet.py +5 -5
- nextrec/models/ranking/fm.py +3 -7
- nextrec/models/ranking/lr.py +0 -0
- nextrec/models/ranking/masknet.py +2 -2
- nextrec/models/ranking/pnn.py +2 -2
- nextrec/models/ranking/widedeep.py +2 -2
- nextrec/models/ranking/xdeepfm.py +2 -2
- nextrec/models/representation/__init__.py +9 -0
- nextrec/models/{generative → representation}/rqvae.py +9 -9
- nextrec/models/retrieval/__init__.py +0 -0
- nextrec/models/{match → retrieval}/dssm.py +8 -3
- nextrec/models/{match → retrieval}/dssm_v2.py +8 -3
- nextrec/models/{match → retrieval}/mind.py +4 -3
- nextrec/models/{match → retrieval}/sdm.py +4 -3
- nextrec/models/{match → retrieval}/youtube_dnn.py +8 -3
- nextrec/utils/__init__.py +60 -46
- nextrec/utils/config.py +8 -7
- nextrec/utils/console.py +371 -0
- nextrec/utils/{synthetic_data.py → data.py} +102 -15
- nextrec/utils/feature.py +15 -0
- nextrec/utils/torch_utils.py +411 -0
- {nextrec-0.4.8.dist-info → nextrec-0.4.9.dist-info}/METADATA +6 -6
- nextrec-0.4.9.dist-info/RECORD +70 -0
- nextrec/utils/cli_utils.py +0 -58
- nextrec/utils/device.py +0 -78
- nextrec/utils/distributed.py +0 -141
- nextrec/utils/file.py +0 -92
- nextrec/utils/initializer.py +0 -79
- nextrec/utils/optimizer.py +0 -75
- nextrec/utils/tensor.py +0 -72
- nextrec-0.4.8.dist-info/RECORD +0 -71
- /nextrec/models/{match/__init__.py → ranking/eulernet.py} +0 -0
- {nextrec-0.4.8.dist-info → nextrec-0.4.9.dist-info}/WHEEL +0 -0
- {nextrec-0.4.8.dist-info → nextrec-0.4.9.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.8.dist-info → nextrec-0.4.9.dist-info}/licenses/LICENSE +0 -0
nextrec/data/preprocessor.py
CHANGED
@@ -2,46 +2,48 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
 Date: create on 13/11/2025
-Checkpoint: edit on
+Checkpoint: edit on 19/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
 from __future__ import annotations
+
+import functools
+import logging
 import os
 import pickle
-import
-import
+from pathlib import Path
+from typing import Any, Dict, Literal, Optional, Union
+
 import numpy as np
 import pandas as pd
-…
-import
-from pathlib import Path
-from typing import Dict, Union, Optional, Literal, Any
+import pyarrow as pa
+import pyarrow.parquet as pq
 from sklearn.preprocessing import (
-…
+    LabelEncoder,
+    MaxAbsScaler,
     MinMaxScaler,
     RobustScaler,
-…
-    LabelEncoder,
+    StandardScaler,
 )
 
-…
+from nextrec.__version__ import __version__
 from nextrec.basic.features import FeatureSet
 from nextrec.basic.loggers import colorize
 from nextrec.basic.session import resolve_save_path
-from nextrec.
-…
+from nextrec.data.data_processing import hash_md5_mod
+from nextrec.utils.console import progress
+from nextrec.utils.data import (
+    default_output_dir,
     iter_file_chunks,
-    read_table,
     load_dataframes,
-…
+    read_table,
+    resolve_file_paths,
 )
 
-from nextrec.__version__ import __version__
-…
 
 class DataProcessor(FeatureSet):
-    def __init__(self):
+    def __init__(self, hash_cache_size: int = 200_000):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}
         self.sequence_features: Dict[str, Dict[str, Any]] = {}
@@ -56,7 +58,16 @@ class DataProcessor(FeatureSet):
         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
-        self.set_target_id([], [])
+        self.set_target_id(target=[], id_columns=[])
+
+        # cache hash function
+        self.hash_cache_size = int(hash_cache_size)
+        if self.hash_cache_size > 0:
+            self.hash_fn = functools.lru_cache(maxsize=self.hash_cache_size)(
+                hash_md5_mod
+            )
+        else:
+            self.hash_fn = hash_md5_mod
 
     def add_numeric_feature(
         self,
@@ -76,7 +87,9 @@ class DataProcessor(FeatureSet):
         fill_na: str = "<UNK>",
     ):
         if encode_method == "hash" and hash_size is None:
-            raise ValueError(
+            raise ValueError(
+                "[Data Processor Error] hash_size must be specified when encode_method='hash'"
+            )
         self.sparse_features[name] = {
             "encode_method": encode_method,
             "hash_size": hash_size,
@@ -96,7 +109,9 @@ class DataProcessor(FeatureSet):
         separator: str = ",",
     ):
         if encode_method == "hash" and hash_size is None:
-            raise ValueError(
+            raise ValueError(
+                "[Data Processor Error] hash_size must be specified when encode_method='hash'"
+            )
         self.sequence_features[name] = {
             "encode_method": encode_method,
             "hash_size": hash_size,
@@ -109,7 +124,7 @@ class DataProcessor(FeatureSet):
     def add_target(
         self,
         name: str,  # example: 'click'
-        target_type: Literal["binary", "
+        target_type: Literal["binary", "regression"] = "binary",
         label_map: Optional[
             Dict[str, int]
         ] = None,  # example: {'click': 1, 'no_click': 0}
@@ -121,7 +136,18 @@ class DataProcessor(FeatureSet):
         self.set_target_id(list(self.target_features.keys()), [])
 
     def hash_string(self, s: str, hash_size: int) -> int:
-        return
+        return self.hash_fn(str(s), int(hash_size))
+
+    def clear_hash_cache(self) -> None:
+        cache_clear = getattr(self.hash_fn, "cache_clear", None)
+        if callable(cache_clear):
+            cache_clear()
+
+    def hash_cache_info(self):
+        cache_info = getattr(self.hash_fn, "cache_info", None)
+        if callable(cache_info):
+            return cache_info()
+        return None
 
     def process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
@@ -132,21 +158,22 @@ class DataProcessor(FeatureSet):
             # Default use mean value to fill missing values for numeric features
             fill_na = data.mean()
             config["fill_na_value"] = fill_na
-…
-            scaler = MaxAbsScaler()
-        elif scaler_type == "log":
-            scaler = None
-        elif scaler_type == "none":
+        scaler_map = {
+            "standard": StandardScaler,
+            "minmax": MinMaxScaler,
+            "robust": RobustScaler,
+            "maxabs": MaxAbsScaler,
+        }
+        if scaler_type in ("log", "none"):
             scaler = None
         else:
-…
+            scaler_cls = scaler_map.get(scaler_type)
+            if scaler_cls is None:
+                raise ValueError(
+                    f"[Data Processor Error] Unknown scaler type: {scaler_type}"
+                )
+            scaler = scaler_cls()
+        if scaler is not None:
             filled_data = data.fillna(config.get("fill_na_value", 0))
             values = np.array(filled_data.values, dtype=np.float64).reshape(-1, 1)
             scaler.fit(values)
@@ -177,15 +204,18 @@ class DataProcessor(FeatureSet):
         return result
 
     def process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-…
+        _ = str(data.name)
         encode_method = config["encode_method"]
         fill_na = config["fill_na"]  # <UNK>
         filled_data = data.fillna(fill_na).astype(str)
         if encode_method == "label":
-…
+            vocab = sorted(set(filled_data.tolist()))
+            if "<UNK>" not in vocab:
+                vocab.append("<UNK>")
+            token_to_idx = {token: idx for idx, token in enumerate(vocab)}
+            config["_token_to_idx"] = token_to_idx
+            config["_unk_index"] = token_to_idx["<UNK>"]
+            config["vocab_size"] = len(vocab)
         elif encode_method == "hash":
             config["vocab_size"] = config["hash_size"]
 
@@ -195,18 +225,32 @@ class DataProcessor(FeatureSet):
         name = str(data.name)
         encode_method = config["encode_method"]
         fill_na = config["fill_na"]
-…
+
+        sparse_series = (
+            data if isinstance(data, pd.Series) else pd.Series(data, name=name)
+        )
+        sparse_series = sparse_series.fillna(fill_na).astype(str)
         if encode_method == "label":
+            token_to_idx = config.get("_token_to_idx")
+            if isinstance(token_to_idx, dict):
+                unk_index = int(config.get("_unk_index", 0))
+                return np.fromiter(
+                    (token_to_idx.get(v, unk_index) for v in sparse_series.to_numpy()),
+                    dtype=np.int64,
+                    count=sparse_series.size,
+                )
             le = self.label_encoders.get(name)
             if le is None:
-                raise ValueError(
-…
+                raise ValueError(
+                    f"[Data Processor Error] LabelEncoder for {name} not fitted"
+                )
+            cat = pd.Categorical(sparse_series, categories=le.classes_)
+            codes = cat.codes  # -1 indicates unknown category
+            unk_index = 0
+            if "<UNK>" in le.classes_:
+                unk_index = int(list(le.classes_).index("<UNK>"))
+            return np.where(codes < 0, unk_index, codes).astype(np.int64, copy=False)
+
         if encode_method == "hash":
             hash_size = config["hash_size"]
             hash_fn = self.hash_string
@@ -218,35 +262,22 @@ class DataProcessor(FeatureSet):
             return np.array([], dtype=np.int64)
 
     def process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
-…
+        _ = str(data.name)
         encode_method = config["encode_method"]
         separator = config["separator"]
         if encode_method == "label":
             all_tokens = set()
             for seq in data:
-…
-                elif isinstance(seq, (list, tuple)):
-                    tokens = [str(t) for t in seq]
-                elif isinstance(seq, np.ndarray):
-                    tokens = [str(t) for t in seq.tolist()]
-                else:
-                    continue
-                all_tokens.update(tokens)
-            if len(all_tokens) == 0:
-                all_tokens.add("<PAD>")
-            le = LabelEncoder()
-            le.fit(list(all_tokens))
-            self.label_encoders[name] = le
-            config["vocab_size"] = len(le.classes_)
+                all_tokens.update(self.extract_sequence_tokens(seq, separator))
+            vocab = sorted(all_tokens)
+            if not vocab:
+                vocab = ["<PAD>"]
+            if "<UNK>" not in vocab:
+                vocab.append("<UNK>")
+            token_to_idx = {token: idx for idx, token in enumerate(vocab)}
+            config["_token_to_idx"] = token_to_idx
+            config["_unk_index"] = token_to_idx["<UNK>"]
+            config["vocab_size"] = len(vocab)
         elif encode_method == "hash":
             config["vocab_size"] = config["hash_size"]
 
@@ -267,15 +298,17 @@ class DataProcessor(FeatureSet):
         split_fn = str.split
         is_nan = np.isnan
         if encode_method == "label":
-…
-            if le is None:
-                raise ValueError(f"LabelEncoder for {name} not fitted")
-            class_to_idx = config.get("_class_to_idx")
+            class_to_idx = config.get("_token_to_idx") or config.get("_class_to_idx")
             if class_to_idx is None:
+                le = self.label_encoders.get(name)
+                if le is None:
+                    raise ValueError(f"LabelEncoder for {name} not fitted")
                 class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
                 config["_class_to_idx"] = class_to_idx
+            unk_index = int(config.get("_unk_index", class_to_idx.get("<UNK>", 0)))
         else:
             class_to_idx = None  # type: ignore
+            unk_index = 0
         hash_fn = self.hash_string
         hash_size = config.get("hash_size")
         for i, seq in enumerate(arr):
@@ -294,13 +327,15 @@ class DataProcessor(FeatureSet):
                 tokens = []
             if encode_method == "label":
                 encoded = [
-                    class_to_idx.get(token.strip(),
+                    class_to_idx.get(token.strip(), unk_index)  # type: ignore[union-attr]
                     for token in tokens
                     if token is not None and token != ""
                 ]
             elif encode_method == "hash":
                 if hash_size is None:
-                    raise ValueError(
+                    raise ValueError(
+                        "[Data Processor Error] hash_size must be set for hash encoding"
+                    )
                 encoded = [
                     hash_fn(str(token), hash_size)
                     for token in tokens
@@ -319,7 +354,7 @@ class DataProcessor(FeatureSet):
         name = str(data.name)
         target_type = config["target_type"]
         label_map = config.get("label_map")
-        if target_type
+        if target_type == "binary":
             if label_map is None:
                 unique_values = data.dropna().unique()
                 sorted_values = sorted(unique_values)
@@ -345,10 +380,12 @@ class DataProcessor(FeatureSet):
         if target_type == "regression":
             values = np.array(data.values, dtype=np.float32)
             return values
-…
+        if target_type == "binary":
             label_map = self.target_encoders.get(name)
             if label_map is None:
-                raise ValueError(
+                raise ValueError(
+                    f"[Data Processor Error] Target encoder for {name} not fitted"
+                )
             result = []
             for val in data:
                 str_val = str(val)
@@ -357,9 +394,10 @@ class DataProcessor(FeatureSet):
                 else:
                     logger.warning(f"Unknown target value: {val}, mapping to 0")
                     result.append(0)
-            return np.array(
-…
+            return np.array(result, dtype=np.float32)
+        raise ValueError(
+            f"[Data Processor Error] Unsupported target type: {target_type}"
+        )
 
     def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
         """Load all data from a file or directory path into a single DataFrame."""
@@ -414,51 +452,47 @@ class DataProcessor(FeatureSet):
         missing_features = set()
         for file_path in file_paths:
             for chunk in iter_file_chunks(file_path, file_type, chunk_size):
-…
-                        tokens.extend(self.extract_sequence_tokens(val, separator))
-                    seq_vocab[name].update(tokens)
+                columns = set(chunk.columns)
+                feature_groups = [
+                    ("numeric", self.numeric_features),
+                    ("sparse", self.sparse_features),
+                    ("sequence", self.sequence_features),
+                ]
+                for group, features in feature_groups:
+                    missing_features.update(features.keys() - columns)
+                    for name in features.keys() & columns:
+                        config = features[name]
+                        series = chunk[name]
+                        if group == "numeric":
+                            values = pd.to_numeric(series, errors="coerce").dropna()
+                            if values.empty:
+                                continue
+                            acc = numeric_acc[name]
+                            arr = values.to_numpy(dtype=np.float64, copy=False)
+                            acc["count"] += arr.size
+                            acc["sum"] += float(arr.sum())
+                            acc["sumsq"] += float(np.square(arr).sum())
+                            acc["min"] = min(acc["min"], float(arr.min()))
+                            acc["max"] = max(acc["max"], float(arr.max()))
+                            acc["max_abs"] = max(
+                                acc["max_abs"], float(np.abs(arr).max())
+                            )
+                        elif group == "sparse":
+                            fill_na = config["fill_na"]
+                            series = series.fillna(fill_na).astype(str)
+                            sparse_vocab[name].update(series.tolist())
+                        else:
+                            separator = config["separator"]
+                            tokens = []
+                            for val in series:
+                                tokens.extend(
+                                    self.extract_sequence_tokens(val, separator)
+                                )
+                            seq_vocab[name].update(tokens)
 
                 # target features
-…
-                        missing_features.add(name)
-                        continue
+                missing_features.update(self.target_features.keys() - columns)
+                for name in self.target_features.keys() & columns:
                     vals = chunk[name].dropna().tolist()
                     target_values[name].update(vals)
         if missing_features:
@@ -489,6 +523,7 @@ class DataProcessor(FeatureSet):
                 )
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
+
             elif scaler_type == "minmax":
                 data_min = acc["min"] if np.isfinite(acc["min"]) else 0.0
                 data_max = acc["max"] if np.isfinite(acc["max"]) else data_min
@@ -504,11 +539,13 @@ class DataProcessor(FeatureSet):
                 scaler.min_ = feature_min - scaler.data_min_ * scale
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
+
             elif scaler_type == "maxabs":
                 scaler = MaxAbsScaler()
                 scaler.max_abs_ = np.array([acc["max_abs"]], dtype=np.float64)
                 scaler.n_samples_seen_ = np.array([int(acc["count"])], dtype=np.int64)
                 self.scalers[name] = scaler
+
             elif scaler_type in ("log", "none", "robust"):
                 # log and none do not require fitting; robust requires full data and is handled earlier
                 continue
@@ -522,21 +559,27 @@ class DataProcessor(FeatureSet):
                 if not vocab:
                     logger.warning(f"Sparse feature {name} has empty vocabulary")
                     continue
-…
+                vocab_list = sorted(vocab)
+                if "<UNK>" not in vocab_list:
+                    vocab_list.append("<UNK>")
+                token_to_idx = {token: idx for idx, token in enumerate(vocab_list)}
+                config["_token_to_idx"] = token_to_idx
+                config["_unk_index"] = token_to_idx["<UNK>"]
+                config["vocab_size"] = len(vocab_list)
            elif config["encode_method"] == "hash":
                 config["vocab_size"] = config["hash_size"]
 
         # finalize sequence vocabularies
         for name, config in self.sequence_features.items():
             if config["encode_method"] == "label":
-…
+                vocab_set = seq_vocab[name]
+                vocab_list = sorted(vocab_set) if vocab_set else ["<PAD>"]
+                if "<UNK>" not in vocab_list:
+                    vocab_list.append("<UNK>")
+                token_to_idx = {token: idx for idx, token in enumerate(vocab_list)}
+                config["_token_to_idx"] = token_to_idx
+                config["_unk_index"] = token_to_idx["<UNK>"]
+                config["vocab_size"] = len(vocab_list)
             elif config["encode_method"] == "hash":
                 config["vocab_size"] = config["hash_size"]
 
@@ -545,37 +588,14 @@ class DataProcessor(FeatureSet):
             if not target_values[name]:
                 logger.warning(f"Target {name} has no valid values in provided files")
                 continue
-…
-            unique_values = list(target_values[name])
-            try:
-                sorted_values = sorted(unique_values)
-            except TypeError:
-                sorted_values = sorted(unique_values, key=lambda x: str(x))
-…
-            label_map = config["label_map"]
-            if label_map is None:
-                try:
-                    int_values = [int(v) for v in sorted_values]
-                    if int_values == list(range(len(int_values))):
-                        label_map = {str(val): int(val) for val in sorted_values}
-                    else:
-                        label_map = {
-                            str(val): idx for idx, val in enumerate(sorted_values)
-                        }
-                except (ValueError, TypeError):
-                    label_map = {
-                        str(val): idx for idx, val in enumerate(sorted_values)
-                    }
-                config["label_map"] = label_map
-…
-            self.target_encoders[name] = label_map
+            self.process_target_fit(
+                pd.Series(list(target_values[name]), name=name), config
+            )
 
         self.is_fitted = True
         logger.info(
             colorize(
-                "DataProcessor fitted successfully
+                "DataProcessor fitted successfully",
                 color="green",
                 bold=True,
             )
@@ -589,69 +609,59 @@ class DataProcessor(FeatureSet):
         persist: bool,
         save_format: Optional[Literal["csv", "parquet"]],
         output_path: Optional[str],
+        warn_missing: bool = True,
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray]]:
         logger = logging.getLogger()
-…
-        if isinstance(data,
-…
+        is_dataframe = isinstance(data, pd.DataFrame)
+        data_dict = data if isinstance(data, dict) else None
+
+        result_dict: Dict[str, np.ndarray] = {}
+        if is_dataframe:
+            df: pd.DataFrame = data  # type: ignore[assignment]
+            for col in df.columns:
+                result_dict[col] = df[col].to_numpy(copy=False)
         else:
-…
-                    continue
-                series_data = pd.Series(data_dict[name], name=name)
-                processed = self.process_sequence_feature_transform(series_data, config)
-                result_dict[name] = processed
-…
-        # process target features
-        for name, config in self.target_features.items():
-            if name not in data_dict:
-                logger.warning(f"Target {name} not found in data")
-                continue
-            series_data = pd.Series(data_dict[name], name=name)
-            processed = self.process_target_transform(series_data, config)
-            result_dict[name] = processed
+            if data_dict is None:
+                raise ValueError(
+                    f"[Data Processor Error] Unsupported data type: {type(data)}"
+                )
+            for key, value in data_dict.items():
+                if isinstance(value, pd.Series):
+                    result_dict[key] = value.to_numpy(copy=False)
+                else:
+                    result_dict[key] = np.asarray(value)
+
+        data_columns = data.columns if is_dataframe else data_dict
+        feature_groups = [
+            ("Numeric", self.numeric_features, self.process_numeric_feature_transform),
+            ("Sparse", self.sparse_features, self.process_sparse_feature_transform),
+            (
+                "Sequence",
+                self.sequence_features,
+                self.process_sequence_feature_transform,
+            ),
+            ("Target", self.target_features, self.process_target_transform),
+        ]
+        for label, features, transform_fn in feature_groups:
+            for name, config in features.items():
+                present = name in data_columns  # type: ignore[operator]
+                if not present:
+                    if warn_missing:
+                        logger.warning(f"{label} feature {name} not found in data")
+                    continue
+                series_data = (
+                    data[name]
+                    if is_dataframe
+                    else pd.Series(result_dict[name], name=name)
+                )
+                result_dict[name] = transform_fn(series_data, config)
 
         def dict_to_dataframe(result: Dict[str, np.ndarray]) -> pd.DataFrame:
             # Convert all arrays to Series/lists at once to avoid fragmentation
             columns_dict = {}
             for key, value in result.items():
                 if key in self.sequence_features:
-…
-                    columns_dict[key] = [np.asarray(seq).tolist() for seq in value]
+                    columns_dict[key] = np.asarray(value).tolist()
                 else:
                     columns_dict[key] = value
             return pd.DataFrame(columns_dict)
@@ -667,7 +677,7 @@ class DataProcessor(FeatureSet):
         if persist:
             if output_path is None:
                 raise ValueError(
-                    "output_path must be provided when persisting transformed data."
+                    "[Data Processor Error] output_path must be provided when persisting transformed data."
                 )
             output_dir = Path(output_path)
             if output_dir.suffix:
@@ -694,8 +704,12 @@ class DataProcessor(FeatureSet):
         input_path: str,
         output_path: Optional[str],
         save_format: Optional[Literal["csv", "parquet"]],
+        chunk_size: int = 200000,
     ) -> list[str]:
-        """Transform data from files under a path and save them to a new location.
+        """Transform data from files under a path and save them to a new location.
+
+        Uses chunked reading/writing to keep peak memory bounded for large files.
+        """
         logger = logging.getLogger()
         file_paths, file_type = resolve_file_paths(input_path)
         target_format = save_format or file_type
@@ -709,20 +723,82 @@ class DataProcessor(FeatureSet):
         output_root = base_output_dir / "transformed_data"
         output_root.mkdir(parents=True, exist_ok=True)
         saved_paths = []
-        for file_path in
-            df = read_table(file_path, file_type)
-            transformed_df = self.transform_in_memory(
-                df, return_dict=False, persist=False, save_format=None, output_path=None
-            )
-            assert isinstance(
-                transformed_df, pd.DataFrame
-            ), "Expected DataFrame when return_dict=False"
+        for file_path in progress(file_paths, description="Transforming files"):
             source_path = Path(file_path)
             target_file = output_root / f"{source_path.stem}.{target_format}"
-…
+
+            # Stream transform for large files
+
+            if chunk_size <= 0:
+                # fallback to full load behavior
+                df = read_table(file_path, file_type)
+                transformed_df = self.transform_in_memory(
+                    df,
+                    return_dict=False,
+                    persist=False,
+                    save_format=None,
+                    output_path=None,
+                    warn_missing=True,
+                )
+                assert isinstance(
+                    transformed_df, pd.DataFrame
+                ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+                if target_format == "csv":
+                    transformed_df.to_csv(target_file, index=False)
+                else:
+                    transformed_df.to_parquet(target_file, index=False)
+                saved_paths.append(str(target_file.resolve()))
+                continue
+
+            first_chunk = True
+            if target_format == "parquet":
+                writer: pq.ParquetWriter | None = None
+                try:
+                    for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+                        transformed_df = self.transform_in_memory(
+                            chunk,
+                            return_dict=False,
+                            persist=False,
+                            save_format=None,
+                            output_path=None,
+                            warn_missing=first_chunk,
+                        )
+                        assert isinstance(
+                            transformed_df, pd.DataFrame
+                        ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+                        table = pa.Table.from_pandas(
+                            transformed_df, preserve_index=False
+                        )
+                        if writer is None:
+                            writer = pq.ParquetWriter(target_file, table.schema)
+                        writer.write_table(table)
+                        first_chunk = False
+                finally:
+                    if writer is not None:
+                        writer.close()
             else:
-…
+                # CSV: append chunks; header only once
+                # (truncate first to avoid mixing with existing files)
+                target_file.parent.mkdir(parents=True, exist_ok=True)
+                with open(target_file, "w", encoding="utf-8", newline="") as f:
+                    f.write("")
+                for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+                    transformed_df = self.transform_in_memory(
+                        chunk,
+                        return_dict=False,
+                        persist=False,
+                        save_format=None,
+                        output_path=None,
+                        warn_missing=first_chunk,
+                    )
+                    assert isinstance(
+                        transformed_df, pd.DataFrame
+                    ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+                    transformed_df.to_csv(
+                        target_file, index=False, mode="a", header=first_chunk
+                    )
+                    first_chunk = False
+
             saved_paths.append(str(target_file.resolve()))
             logger.info(
                 colorize(
@@ -754,26 +830,18 @@ class DataProcessor(FeatureSet):
         if isinstance(data, dict):
             data = pd.DataFrame(data)
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
-…
-            self.
-…
-                logger.warning(f"Sequence feature {name} not found in data")
-                continue
-            self.process_sequence_feature_fit(data[name], config)
-        for name, config in self.target_features.items():
-            if name not in data.columns:
-                logger.warning(f"Target {name} not found in data")
-                continue
-            self.process_target_fit(data[name], config)
+        feature_groups = [
+            ("Numeric", self.numeric_features, self.process_numeric_feature_fit),
+            ("Sparse", self.sparse_features, self.process_sparse_feature_fit),
+            ("Sequence", self.sequence_features, self.process_sequence_feature_fit),
+            ("Target", self.target_features, self.process_target_fit),
+        ]
+        for label, features, fit_fn in feature_groups:
+            for name, config in features.items():
+                if name not in data.columns:
+                    logger.warning(f"{label} feature {name} not found in data")
+                    continue
+                fit_fn(data[name], config)
         self.is_fitted = True
         return self
 
@@ -783,15 +851,20 @@ class DataProcessor(FeatureSet):
         return_dict: bool = True,
         save_format: Optional[Literal["csv", "parquet"]] = None,
         output_path: Optional[str] = None,
+        chunk_size: int = 200000,
     ) -> Union[pd.DataFrame, Dict[str, np.ndarray], list[str]]:
         if not self.is_fitted:
-            raise ValueError(
+            raise ValueError(
+                "[Data Processor Error] DataProcessor must be fitted before transform"
+            )
         if isinstance(data, (str, os.PathLike)):
             if return_dict:
                 raise ValueError(
-                    "Path transform writes files only; set return_dict=False when passing a path."
+                    "[Data Processor Error] Path transform writes files only; set return_dict=False when passing a path."
                 )
-            return self.transform_path(
+            return self.transform_path(
+                str(data), output_path, save_format, chunk_size=chunk_size
+            )
         return self.transform_in_memory(
             data=data,
             return_dict=return_dict,
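
For orientation, here is a minimal usage sketch of the 0.4.9 API surface visible in this diff: the constructor gains a `hash_cache_size` argument (LRU-cached hashing via `hash_md5_mod`), and `transform` gains a `chunk_size` argument that streams path-based transforms chunk by chunk. The paths, column name, and example DataFrame below are illustrative assumptions, not taken from the package.

```python
# Illustrative sketch only; column names, paths, and the registration step
# are assumptions based on the signatures shown in the diff above.
import pandas as pd

from nextrec.data.preprocessor import DataProcessor

processor = DataProcessor(hash_cache_size=200_000)  # <= 0 disables the hash LRU cache
# ... register features here (add_numeric_feature / sparse / sequence) ...
processor.add_target("click", target_type="binary")  # hypothetical target column

train_df = pd.DataFrame({"click": [0, 1, 1, 0]})  # toy frame with the target column
processor.fit(train_df)

# In-memory transform: returns a dict of numpy arrays (or a DataFrame).
arrays = processor.transform(train_df, return_dict=True)

# Path transform: writes transformed files chunk by chunk and returns their paths;
# return_dict must be False when a path is passed.
saved_files = processor.transform(
    "data/raw/",                  # assumed input directory of csv/parquet files
    return_dict=False,
    save_format="parquet",
    output_path="data/processed/",
    chunk_size=200_000,
)
```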