nextrec 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,14 +2,13 @@
  Dataloader definitions

  Date: create on 27/10/2025
- Checkpoint: edit on 01/01/2026
+ Checkpoint: edit on 31/01/2026
  Author: Yang Zhou,zyaztec@gmail.com
  """

  import logging
  import os
  from pathlib import Path
- from typing import cast

  import numpy as np
  import pandas as pd
@@ -26,7 +25,6 @@ from nextrec.data.batch_utils import collate_fn
  from nextrec.data.data_processing import get_column_data
  from nextrec.data.preprocessor import DataProcessor
  from nextrec.utils.data import (
- check_streaming_support,
  iter_file_chunks,
  read_table,
  resolve_file_paths,
@@ -85,6 +83,8 @@ class FileDataset(FeatureSet, IterableDataset):
  chunk_size: int = 10000,
  file_type: str = "csv",
  processor: DataProcessor | None = None,
+ shard_rank: int = 0,
+ shard_count: int = 1,
  ):
  """Streaming dataset for reading files in chunks.

@@ -103,6 +103,8 @@ class FileDataset(FeatureSet, IterableDataset):
  self.chunk_size = chunk_size
  self.file_type = file_type
  self.processor = processor
+ self.shard_rank = int(shard_rank)
+ self.shard_count = int(shard_count)

  self.set_all_features(
  dense_features,
@@ -111,43 +113,56 @@ class FileDataset(FeatureSet, IterableDataset):
  target_columns,
  id_columns,
  )
- self.current_file_index = 0
  self.total_files = len(file_paths)

  def __iter__(self):
- self.current_file_index = 0
- for file_path in self.file_paths:
- self.current_file_index += 1
+ shard_count = max(int(self.shard_count), 1)
+ shard_rank = int(self.shard_rank) if shard_count > 1 else 0
+
+ # assign files to each worker
+ file_indices_all = list(range(self.total_files))
+ if shard_count > 1:
+ file_indices_all = [
+ idx for idx in file_indices_all if (idx % shard_count) == shard_rank
+ ]
+ file_indices = file_indices_all
+ if not file_indices:
+ return
+
+ for file_index in file_indices:
+ file_path = self.file_paths[file_index]
+ chunk_index = 0
  for chunk in iter_file_chunks(file_path, self.file_type, self.chunk_size):
- tensors = self.dataframeto_tensors(chunk)
- if tensors is not None:
- yield tensors
- del chunk
-
- def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
- if self.processor is not None:
- if not self.processor.is_fitted:
- raise ValueError(
- "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+ if shard_count > 1 and self.total_files == 1:
+ if (chunk_index % shard_count) != shard_rank:
+ chunk_index += 1
+ continue
+ chunk_index += 1
+ if self.processor is not None:
+ if not self.processor.is_fitted:
+ raise ValueError(
+ "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+ )
+ transformed_data = self.processor.transform(chunk, return_dict=True)
+ else:
+ transformed_data = chunk
+ # if data=str|os.PathLike; processor.transform(data, return_dict=False) will return file paths list
+ # which will cause error in build_tensors_from_data
+ if isinstance(transformed_data, list):
+ raise TypeError(
+ "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
+ )
+ batch = build_tensors_from_data(
+ data=transformed_data,
+ raw_data=chunk,
+ features=self.all_features,
+ target_columns=self.target_columns,
+ id_columns=self.id_columns,
  )
- transformed_data = self.processor.transform(df, return_dict=True)
- else:
- transformed_data = df
- if isinstance(transformed_data, list):
- raise TypeError(
- "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
- )
- safe_data = cast(dict | pd.DataFrame, transformed_data)
- batch = build_tensors_from_data(
- data=safe_data,
- raw_data=df,
- features=self.all_features,
- target_columns=self.target_columns,
- id_columns=self.id_columns,
- )
- if batch is not None:
- batch["_already_batched"] = True
- return batch
+ # Indicate streaming mode for collate_fn to avoid extra batching.
+ batch["stream_mode"] = True
+ yield batch
+ del chunk, transformed_data


  class RecDataLoader(FeatureSet):
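
Note on the reworked __iter__: work is split across shards by file index, with a fallback to per-chunk round-robin when only a single file is available. A minimal sketch of the same modulo assignment, for intuition only (the helper below is made up, not part of nextrec):

    def shard_indices(total: int, shard_rank: int, shard_count: int) -> list[int]:
        # Each shard keeps the indices whose value modulo shard_count equals its rank.
        return [idx for idx in range(total) if idx % shard_count == shard_rank]

    # Five files across two shards: disjoint and jointly exhaustive.
    print(shard_indices(5, 0, 2))  # [0, 2, 4]
    print(shard_indices(5, 1, 2))  # [1, 3]
    # With a single file the same rule is applied to chunk indices instead,
    # so each shard yields every shard_count-th chunk of that file.
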
@@ -183,8 +198,8 @@ class RecDataLoader(FeatureSet):
  dict
  | pd.DataFrame
  | str
- | os.PathLike
  | list[str]
+ | os.PathLike
  | list[os.PathLike]
  | DataLoader
  | None
@@ -195,10 +210,12 @@ class RecDataLoader(FeatureSet):
  chunk_size: int = 10000,
  num_workers: int = 0,
  prefetch_factor: int | None = None,
+ shard_rank: int = 0,
+ shard_count: int = 1,
  sampler=None,
  ) -> DataLoader:
  """
- Create a DataLoader from various data sources.
+ Create a DataLoader from various data sources: dict, pd.DataFrame, file path(s), or existing DataLoader.

  Args:
  data: Data source, can be a dict, pd.DataFrame, file path (str), or existing DataLoader.
@@ -212,13 +229,6 @@ class RecDataLoader(FeatureSet):
  Returns:
  DataLoader instance.
  """
- if streaming and num_workers > 0:
- logging.warning(
- f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming=True. "
- "Each worker would create its own data stream, causing data duplication. "
- "Forcing num_workers=0."
- )
- num_workers = 0

  if isinstance(data, DataLoader):
  return data
@@ -237,6 +247,8 @@ class RecDataLoader(FeatureSet):
  chunk_size=chunk_size,
  num_workers=num_workers,
  prefetch_factor=prefetch_factor,
+ shard_rank=shard_rank,
+ shard_count=shard_count,
  )

  if isinstance(data, (dict, pd.DataFrame)):
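
The shard_rank/shard_count arguments are threaded from the public entry point down to FileDataset so that, under data-parallel training, each process streams a disjoint slice of the input. One way a caller might derive these two values, sketched under the assumption that training is launched with torch.distributed/torchrun (not part of nextrec itself):

    import os
    import torch.distributed as dist

    def current_shard() -> tuple[int, int]:
        # Prefer an initialized process group; fall back to the RANK/WORLD_SIZE
        # environment variables that torchrun sets, else assume a single shard.
        if dist.is_available() and dist.is_initialized():
            return dist.get_rank(), dist.get_world_size()
        return int(os.environ.get("RANK", 0)), int(os.environ.get("WORLD_SIZE", 1))

    shard_rank, shard_count = current_shard()  # values to pass through as shard_rank / shard_count
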
@@ -260,6 +272,13 @@ class RecDataLoader(FeatureSet):
  prefetch_factor: int | None = None,
  sampler=None,
  ) -> DataLoader:
+ """
+ Create a DataLoader from in-memory data. It builds a TensorDictDataset
+ that implements __getitem__ and __len__, allowing PyTorch DataLoader to
+ assign data to each worker.
+ """
+
+ # keep a copy of raw data for id columns
  raw_data = data

  if self.processor is not None:
  if self.processor is not None:
@@ -268,6 +287,7 @@ class RecDataLoader(FeatureSet):
268
287
  "[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory"
269
288
  )
270
289
  data = self.processor.transform(data, return_dict=True) # type: ignore
290
+
271
291
  tensors = build_tensors_from_data(
272
292
  data=data,
273
293
  raw_data=raw_data,
@@ -275,14 +295,8 @@ class RecDataLoader(FeatureSet):
  target_columns=self.target_columns,
  id_columns=self.id_columns,
  )
- if tensors is None:
- raise ValueError(
- "[RecDataLoader Error] No valid tensors could be built from the provided data."
- )
  dataset = TensorDictDataset(tensors)
- loader_kwargs = {}
- if num_workers > 0 and prefetch_factor is not None:
- loader_kwargs["prefetch_factor"] = prefetch_factor
+
  return DataLoader(
  dataset,
  batch_size=batch_size,
@@ -292,7 +306,7 @@ class RecDataLoader(FeatureSet):
  num_workers=num_workers,
  pin_memory=torch.cuda.is_available(),
  persistent_workers=num_workers > 0,
- **loader_kwargs,
+ prefetch_factor=prefetch_factor if num_workers > 0 else None,
  )

  def create_from_path(
  def create_from_path(
@@ -304,7 +318,15 @@ class RecDataLoader(FeatureSet):
304
318
  chunk_size: int = 10000,
305
319
  num_workers: int = 0,
306
320
  prefetch_factor: int | None = None,
321
+ shard_rank: int = 0,
322
+ shard_count: int = 1,
307
323
  ) -> DataLoader:
324
+ """
325
+ Create a DataLoader from file paths. It builds either a streaming
326
+ IterableDataset (via __iter__) or an in-memory map-style dataset
327
+ (via __getitem__/__len__).
328
+ """
329
+
308
330
  if isinstance(path, (str, os.PathLike)):
309
331
  file_paths, file_type = resolve_file_paths(str(Path(path)))
310
332
  else:
@@ -312,11 +334,16 @@ class RecDataLoader(FeatureSet):
  if not file_paths:
  raise ValueError("[RecDataLoader Error] Empty file path list provided.")

- from nextrec.utils.data import get_file_format_from_extension
-
  file_formats = set()
  for p in file_paths:
- fmt = get_file_format_from_extension(Path(p).suffix)
+ name = Path(p).name
+ ext = name.rsplit(".", 1)[-1].lower() if "." in name else ""
+ if ext in {"csv", "txt"}:
+ fmt = "csv"
+ elif ext == "parquet":
+ fmt = "parquet"
+ else:
+ fmt = None
  if fmt is None:
  raise ValueError(
  f"[RecDataLoader Error] Unsupported file extension: {Path(p).suffix}"
@@ -329,44 +356,35 @@ class RecDataLoader(FeatureSet):
  "Please use a single format per DataLoader."
  )
  file_type = file_formats.pop()
+
  if streaming:
+ # streaming mode with IterableDataset will
+ # keep num_workers=0 and prefetch_factor=None
  return self.load_files_streaming(
  file_paths,
  file_type,
  batch_size,
  chunk_size,
  shuffle,
- num_workers=num_workers,
- prefetch_factor=prefetch_factor,
+ num_workers=0,
+ prefetch_factor=None,
+ shard_rank=shard_rank,
+ shard_count=shard_count,
  )
-
- dfs = []
- total_bytes = 0
- for file_path in file_paths:
- try:
- total_bytes += os.path.getsize(file_path)
- except OSError:
- pass
- try:
+ else:
+ # read all files into memory
+ dfs = []
+ for file_path in file_paths:
  df = read_table(file_path, data_format=file_type)
  dfs.append(df)
- except MemoryError as exc:
- raise MemoryError(
- f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
- ) from exc
- try:
  combined_df = pd.concat(dfs, ignore_index=True)
- except MemoryError as exc:
- raise MemoryError(
- f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use streaming=True or reduce chunk_size."
- ) from exc
- return self.create_from_memory(
- combined_df,
- batch_size,
- shuffle,
- num_workers=num_workers,
- prefetch_factor=prefetch_factor,
- )
+ return self.create_from_memory(
+ combined_df,
+ batch_size,
+ shuffle,
+ num_workers=num_workers,
+ prefetch_factor=prefetch_factor,
+ )

  def load_files_streaming(
390
  self,
@@ -377,12 +395,9 @@ class RecDataLoader(FeatureSet):
  shuffle: bool,
  num_workers: int = 0,
  prefetch_factor: int | None = None,
+ shard_rank: int = 0,
+ shard_count: int = 1,
  ) -> DataLoader:
- if not check_streaming_support(file_type):
- raise ValueError(
- f"[RecDataLoader Error] Format '{file_type}' does not support streaming reads. "
- "Use streaming=False or convert data to csv/parquet."
- )
  if shuffle:
  logging.info(
  "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
@@ -391,13 +406,7 @@ class RecDataLoader(FeatureSet):
  logging.info(
  "[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput."
  )
- if num_workers > 0:
- logging.warning(
- f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming mode. "
- "Each worker would create its own data stream, causing data duplication. "
- "Forcing num_workers=0."
- )
- num_workers = 0
+ # iterable dataset for streaming, implements __iter__
  dataset = FileDataset(
  file_paths=file_paths,
  dense_features=self.dense_features,
@@ -408,20 +417,26 @@ class RecDataLoader(FeatureSet):
  chunk_size=chunk_size,
  file_type=file_type,
  processor=self.processor,
+ shard_rank=shard_rank,
+ shard_count=shard_count,
  )
- loader_kwargs = {}
- if num_workers > 0 and prefetch_factor is not None:
- loader_kwargs["prefetch_factor"] = prefetch_factor
  return DataLoader(
  dataset,
  batch_size=1,
  collate_fn=collate_fn,
- num_workers=num_workers,
- **loader_kwargs,
+ num_workers=0,
+ prefetch_factor=None,
+ pin_memory=torch.cuda.is_available(),
  )


- def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+ def prepare_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+ """
+ Normalize a sequence feature column into a padded int64 numpy array.
+ Converts scalars/lists/arrays into a consistent 2D shape and applies
+ truncation/padding based on the feature definition.
+ """
+
  if isinstance(column, pd.Series):
  column = column.tolist()
  if isinstance(column, (list, tuple)):
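
prepare_sequence_column (formerly normalize_sequence_column) turns ragged per-row ID lists into one dense (rows, max_len) int64 array. A minimal sketch of that pad/truncate step, assuming right-padding with zeros (the real function also handles scalars, Series, and the feature's configured length and padding value):

    import numpy as np

    def pad_sequences(rows: list[list[int]], max_len: int, pad_value: int = 0) -> np.ndarray:
        # Truncate long rows, right-pad short ones, return a dense int64 matrix.
        out = np.full((len(rows), max_len), pad_value, dtype=np.int64)
        for i, row in enumerate(rows):
            row = row[:max_len]
            out[i, : len(row)] = row
        return out

    print(pad_sequences([[3, 7, 9, 2], [5]], max_len=3))
    # [[3 7 9]
    #  [5 0 0]]
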
@@ -473,7 +488,12 @@ def build_tensors_from_data(
  features: list,
  target_columns: list[str],
  id_columns: list[str],
- ) -> dict | None:
+ ) -> dict:
+ """
+ Build feature, label, and ID tensors from raw input using feature definitions.
+ This is used by RecDataLoader to construct model-ready batches.
+ """
+
  feature_tensors = {}
  for feature in features:
  column = get_column_data(data, feature.name)
@@ -482,7 +502,7 @@ def build_tensors_from_data(
  f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
  )
  if isinstance(feature, SequenceFeature):
- arr = normalize_sequence_column(column, feature)
+ arr = prepare_sequence_column(column, feature)
  tensor = to_tensor(arr, dtype=torch.long)
  elif isinstance(feature, DenseFeature):
  arr = np.asarray(column, dtype=np.float32)
@@ -526,5 +546,7 @@ def build_tensors_from_data(
  # Normalize all id columns to strings for consistent downstream handling.
  id_tensors[id_col] = np.asarray(column, dtype=str)
  if not feature_tensors:
- return None
+ raise ValueError(
+ "[RecDataLoader Error] No valid tensors could be built from the provided data."
+ )
  return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}