nextrec 0.5.0-py3-none-any.whl → 0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/model.py +288 -181
- nextrec/basic/summary.py +21 -4
- nextrec/cli.py +36 -17
- nextrec/data/__init__.py +0 -52
- nextrec/data/batch_utils.py +1 -1
- nextrec/data/data_processing.py +1 -35
- nextrec/data/data_utils.py +0 -4
- nextrec/data/dataloader.py +125 -103
- nextrec/data/preprocessor.py +141 -92
- nextrec/loss/__init__.py +0 -36
- nextrec/models/generative/__init__.py +0 -9
- nextrec/models/tree_base/__init__.py +0 -15
- nextrec/models/tree_base/base.py +14 -23
- nextrec/utils/__init__.py +0 -119
- nextrec/utils/data.py +39 -119
- nextrec/utils/model.py +5 -14
- nextrec/utils/torch_utils.py +6 -1
- {nextrec-0.5.0.dist-info → nextrec-0.5.2.dist-info}/METADATA +4 -5
- {nextrec-0.5.0.dist-info → nextrec-0.5.2.dist-info}/RECORD +23 -23
- {nextrec-0.5.0.dist-info → nextrec-0.5.2.dist-info}/WHEEL +0 -0
- {nextrec-0.5.0.dist-info → nextrec-0.5.2.dist-info}/entry_points.txt +0 -0
- {nextrec-0.5.0.dist-info → nextrec-0.5.2.dist-info}/licenses/LICENSE +0 -0
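
The bulk of the change is in nextrec/data/dataloader.py: the streaming FileDataset and the RecDataLoader construction path gain shard_rank/shard_count parameters, and FileDataset.__iter__ now assigns work by index modulo shard_count (per file, or per chunk when there is only a single input file). A minimal standalone sketch of that assignment rule, written for illustration rather than taken from the package, is shown here; full details follow in the diff of dataloader.py below.

```python
# Illustrative sketch of the round-robin sharding rule added in 0.5.2; not nextrec code.
def shard_indices(num_items: int, shard_rank: int = 0, shard_count: int = 1) -> list[int]:
    """Return the item (file or chunk) indices that a given shard should process."""
    shard_count = max(int(shard_count), 1)
    shard_rank = int(shard_rank) if shard_count > 1 else 0
    return [idx for idx in range(num_items) if idx % shard_count == shard_rank]


# Five files split across two shards: rank 0 reads files 0, 2, 4 and rank 1 reads files 1, 3.
assert shard_indices(5, shard_rank=0, shard_count=2) == [0, 2, 4]
assert shard_indices(5, shard_rank=1, shard_count=2) == [1, 3]
```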
nextrec/data/dataloader.py
CHANGED
@@ -2,14 +2,13 @@
 Dataloader definitions
 
 Date: create on 27/10/2025
-Checkpoint: edit on
+Checkpoint: edit on 31/01/2026
 Author: Yang Zhou,zyaztec@gmail.com
 """
 
 import logging
 import os
 from pathlib import Path
-from typing import cast
 
 import numpy as np
 import pandas as pd
@@ -26,7 +25,6 @@ from nextrec.data.batch_utils import collate_fn
 from nextrec.data.data_processing import get_column_data
 from nextrec.data.preprocessor import DataProcessor
 from nextrec.utils.data import (
-    check_streaming_support,
     iter_file_chunks,
     read_table,
     resolve_file_paths,
@@ -85,6 +83,8 @@ class FileDataset(FeatureSet, IterableDataset):
         chunk_size: int = 10000,
         file_type: str = "csv",
         processor: DataProcessor | None = None,
+        shard_rank: int = 0,
+        shard_count: int = 1,
     ):
         """Streaming dataset for reading files in chunks.
 
@@ -103,6 +103,8 @@ class FileDataset(FeatureSet, IterableDataset):
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
+        self.shard_rank = int(shard_rank)
+        self.shard_count = int(shard_count)
 
         self.set_all_features(
             dense_features,
@@ -111,43 +113,56 @@ class FileDataset(FeatureSet, IterableDataset):
             target_columns,
             id_columns,
         )
-        self.current_file_index = 0
         self.total_files = len(file_paths)
 
     def __iter__(self):
-        self.
-
-
+        shard_count = max(int(self.shard_count), 1)
+        shard_rank = int(self.shard_rank) if shard_count > 1 else 0
+
+        # assign files to each worker
+        file_indices_all = list(range(self.total_files))
+        if shard_count > 1:
+            file_indices_all = [
+                idx for idx in file_indices_all if (idx % shard_count) == shard_rank
+            ]
+        file_indices = file_indices_all
+        if not file_indices:
+            return
+
+        for file_index in file_indices:
+            file_path = self.file_paths[file_index]
+            chunk_index = 0
             for chunk in iter_file_chunks(file_path, self.file_type, self.chunk_size):
-
-
-
-
-
-
-
-
-
-
+                if shard_count > 1 and self.total_files == 1:
+                    if (chunk_index % shard_count) != shard_rank:
+                        chunk_index += 1
+                        continue
+                chunk_index += 1
+                if self.processor is not None:
+                    if not self.processor.is_fitted:
+                        raise ValueError(
+                            "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+                        )
+                    transformed_data = self.processor.transform(chunk, return_dict=True)
+                else:
+                    transformed_data = chunk
+                # if data=str|os.pathlike; processor.transform(data, return_dict=False) will return file paths list
+                # which will casue error in build_tensors_from_data
+                if isinstance(transformed_data, list):
+                    raise TypeError(
+                        "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
+                    )
+                batch = build_tensors_from_data(
+                    data=transformed_data,
+                    raw_data=chunk,
+                    features=self.all_features,
+                    target_columns=self.target_columns,
+                    id_columns=self.id_columns,
                 )
-
-
-
-
-                    raise TypeError(
-                        "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
-                    )
-                safe_data = cast(dict | pd.DataFrame, transformed_data)
-                batch = build_tensors_from_data(
-                    data=safe_data,
-                    raw_data=df,
-                    features=self.all_features,
-                    target_columns=self.target_columns,
-                    id_columns=self.id_columns,
-                )
-                if batch is not None:
-                    batch["_already_batched"] = True
-                    return batch
+                # Indicate streaming mode for collate_fn to avoid extra batching.
+                batch["stream_mode"] = True
+                yield batch
+                del chunk, transformed_data
 
 
 class RecDataLoader(FeatureSet):
@@ -183,8 +198,8 @@ class RecDataLoader(FeatureSet):
             dict
             | pd.DataFrame
             | str
-            | os.PathLike
             | list[str]
+            | os.PathLike
             | list[os.PathLike]
             | DataLoader
             | None
@@ -195,10 +210,12 @@ class RecDataLoader(FeatureSet):
         chunk_size: int = 10000,
         num_workers: int = 0,
         prefetch_factor: int | None = None,
+        shard_rank: int = 0,
+        shard_count: int = 1,
         sampler=None,
     ) -> DataLoader:
         """
-        Create a DataLoader from various data sources.
+        Create a DataLoader from various data sources: dict, pd.DataFrame, file path(s), or existing DataLoader.
 
         Args:
             data: Data source, can be a dict, pd.DataFrame, file path (str), or existing DataLoader.
@@ -212,13 +229,6 @@ class RecDataLoader(FeatureSet):
         Returns:
             DataLoader instance.
         """
-        if streaming and num_workers > 0:
-            logging.warning(
-                f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming=True. "
-                "Each worker would create its own data stream, causing data duplication. "
-                "Forcing num_workers=0."
-            )
-            num_workers = 0
 
         if isinstance(data, DataLoader):
             return data
@@ -237,6 +247,8 @@ class RecDataLoader(FeatureSet):
                 chunk_size=chunk_size,
                 num_workers=num_workers,
                 prefetch_factor=prefetch_factor,
+                shard_rank=shard_rank,
+                shard_count=shard_count,
             )
 
         if isinstance(data, (dict, pd.DataFrame)):
@@ -260,6 +272,13 @@ class RecDataLoader(FeatureSet):
         prefetch_factor: int | None = None,
         sampler=None,
     ) -> DataLoader:
+        """
+        Create a DataLoader from in-memory data. It builds a TensorDictDataset
+        that implements __getitem__ and __len__, allowing PyTorch DataLoader to
+        assign data to each worker.
+        """
+
+        # keep a copy of raw data for id columns
         raw_data = data
 
         if self.processor is not None:
@@ -268,6 +287,7 @@ class RecDataLoader(FeatureSet):
                     "[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory"
                 )
             data = self.processor.transform(data, return_dict=True)  # type: ignore
+
         tensors = build_tensors_from_data(
             data=data,
             raw_data=raw_data,
@@ -275,14 +295,8 @@ class RecDataLoader(FeatureSet):
             target_columns=self.target_columns,
             id_columns=self.id_columns,
         )
-        if tensors is None:
-            raise ValueError(
-                "[RecDataLoader Error] No valid tensors could be built from the provided data."
-            )
         dataset = TensorDictDataset(tensors)
-
-        if num_workers > 0 and prefetch_factor is not None:
-            loader_kwargs["prefetch_factor"] = prefetch_factor
+
         return DataLoader(
             dataset,
             batch_size=batch_size,
@@ -292,7 +306,7 @@ class RecDataLoader(FeatureSet):
             num_workers=num_workers,
             pin_memory=torch.cuda.is_available(),
             persistent_workers=num_workers > 0,
-
+            prefetch_factor=prefetch_factor if num_workers > 0 else None,
         )
 
     def create_from_path(
@@ -304,7 +318,15 @@ class RecDataLoader(FeatureSet):
         chunk_size: int = 10000,
         num_workers: int = 0,
         prefetch_factor: int | None = None,
+        shard_rank: int = 0,
+        shard_count: int = 1,
     ) -> DataLoader:
+        """
+        Create a DataLoader from file paths. It builds either a streaming
+        IterableDataset (via __iter__) or an in-memory map-style dataset
+        (via __getitem__/__len__).
+        """
+
         if isinstance(path, (str, os.PathLike)):
             file_paths, file_type = resolve_file_paths(str(Path(path)))
         else:
@@ -312,11 +334,16 @@ class RecDataLoader(FeatureSet):
         if not file_paths:
             raise ValueError("[RecDataLoader Error] Empty file path list provided.")
 
-        from nextrec.utils.data import get_file_format_from_extension
-
         file_formats = set()
         for p in file_paths:
-
+            name = Path(p).name
+            ext = name.rsplit(".", 1)[-1].lower() if "." in name else ""
+            if ext in {"csv", "txt"}:
+                fmt = "csv"
+            elif ext == "parquet":
+                fmt = "parquet"
+            else:
+                fmt = None
             if fmt is None:
                 raise ValueError(
                     f"[RecDataLoader Error] Unsupported file extension: {Path(p).suffix}"
@@ -329,44 +356,35 @@ class RecDataLoader(FeatureSet):
                 "Please use a single format per DataLoader."
             )
         file_type = file_formats.pop()
+
        if streaming:
+            # streaming mode with IterableDataset will
+            # keep num_workers=0 and prefetch_factor=None
            return self.load_files_streaming(
                file_paths,
                file_type,
                batch_size,
                chunk_size,
                shuffle,
-                num_workers=
-                prefetch_factor=
+                num_workers=0,
+                prefetch_factor=None,
+                shard_rank=shard_rank,
+                shard_count=shard_count,
            )
-
-
-
-
-            try:
-                total_bytes += os.path.getsize(file_path)
-            except OSError:
-                pass
-        try:
+        else:
+            # read all files into memory
+            dfs = []
+            for file_path in file_paths:
                df = read_table(file_path, data_format=file_type)
                dfs.append(df)
-        except MemoryError as exc:
-            raise MemoryError(
-                f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
-            ) from exc
-        try:
            combined_df = pd.concat(dfs, ignore_index=True)
-
-
-
-
-
-
-
-                shuffle,
-                num_workers=num_workers,
-                prefetch_factor=prefetch_factor,
-            )
+            return self.create_from_memory(
+                combined_df,
+                batch_size,
+                shuffle,
+                num_workers=num_workers,
+                prefetch_factor=prefetch_factor,
+            )
 
     def load_files_streaming(
         self,
@@ -377,12 +395,9 @@ class RecDataLoader(FeatureSet):
         shuffle: bool,
         num_workers: int = 0,
         prefetch_factor: int | None = None,
+        shard_rank: int = 0,
+        shard_count: int = 1,
     ) -> DataLoader:
-        if not check_streaming_support(file_type):
-            raise ValueError(
-                f"[RecDataLoader Error] Format '{file_type}' does not support streaming reads. "
-                "Use streaming=False or convert data to csv/parquet."
-            )
         if shuffle:
             logging.info(
                 "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
@@ -391,13 +406,7 @@ class RecDataLoader(FeatureSet):
             logging.info(
                 "[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput."
             )
-
-            logging.warning(
-                f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming mode. "
-                "Each worker would create its own data stream, causing data duplication. "
-                "Forcing num_workers=0."
-            )
-            num_workers = 0
+        # iterable dataset for streaming, implements __iter__
         dataset = FileDataset(
             file_paths=file_paths,
             dense_features=self.dense_features,
@@ -408,20 +417,26 @@ class RecDataLoader(FeatureSet):
             chunk_size=chunk_size,
             file_type=file_type,
             processor=self.processor,
+            shard_rank=shard_rank,
+            shard_count=shard_count,
         )
-        loader_kwargs = {}
-        if num_workers > 0 and prefetch_factor is not None:
-            loader_kwargs["prefetch_factor"] = prefetch_factor
         return DataLoader(
             dataset,
             batch_size=1,
             collate_fn=collate_fn,
-            num_workers=
-
+            num_workers=0,
+            prefetch_factor=None,
+            pin_memory=torch.cuda.is_available(),
         )
 
 
-    def
+def prepare_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+    """
+    Normalize a sequence feature column into a padded int64 numpy array.
+    Converts scalars/lists/arrays into a consistent 2D shape and applies
+    truncation/padding based on the feature definition.
+    """
+
     if isinstance(column, pd.Series):
         column = column.tolist()
     if isinstance(column, (list, tuple)):
@@ -473,7 +488,12 @@ def build_tensors_from_data(
     features: list,
     target_columns: list[str],
     id_columns: list[str],
-) -> dict
+) -> dict:
+    """
+    Build feature, label, and ID tensors from raw input using feature definitions.
+    This is used by RecDataLoader to construct model-ready batches.
+    """
+
     feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
@@ -482,7 +502,7 @@ def build_tensors_from_data(
                 f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
             )
         if isinstance(feature, SequenceFeature):
-            arr =
+            arr = prepare_sequence_column(column, feature)
             tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
             arr = np.asarray(column, dtype=np.float32)
@@ -526,5 +546,7 @@ def build_tensors_from_data(
         # Normalize all id columns to strings for consistent downstream handling.
         id_tensors[id_col] = np.asarray(column, dtype=str)
     if not feature_tensors:
-
+        raise ValueError(
+            "[RecDataLoader Error] No valid tensors could be built from the provided data."
+        )
     return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}
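
The new shard_rank/shard_count parameters run through the whole loader-construction path above. One plausible way to drive them in multi-process training is to derive both values from torch.distributed; the sketch below is an assumption about intended usage rather than code from the package, and the loader-construction call it mentions is a hypothetical placeholder.

```python
import torch.distributed as dist


def shard_kwargs() -> dict:
    """Derive shard_rank/shard_count from the distributed context, defaulting to one shard."""
    if dist.is_available() and dist.is_initialized():
        return {"shard_rank": dist.get_rank(), "shard_count": dist.get_world_size()}
    return {"shard_rank": 0, "shard_count": 1}


# Hypothetical wiring: pass the derived values wherever nextrec accepts shard_rank/shard_count,
# e.g. loader = rec_dataloader.create_dataloader(paths, streaming=True, **shard_kwargs())
```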