nextrec-0.1.1-py3-none-any.whl → nextrec-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +4 -4
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -9
- nextrec/basic/callback.py +1 -0
- nextrec/basic/dataloader.py +168 -127
- nextrec/basic/features.py +24 -27
- nextrec/basic/layers.py +328 -159
- nextrec/basic/loggers.py +50 -37
- nextrec/basic/metrics.py +255 -147
- nextrec/basic/model.py +817 -462
- nextrec/data/__init__.py +5 -5
- nextrec/data/data_utils.py +16 -12
- nextrec/data/preprocessor.py +276 -252
- nextrec/loss/__init__.py +12 -12
- nextrec/loss/loss_utils.py +30 -22
- nextrec/loss/match_losses.py +116 -83
- nextrec/models/match/__init__.py +5 -5
- nextrec/models/match/dssm.py +70 -61
- nextrec/models/match/dssm_v2.py +61 -51
- nextrec/models/match/mind.py +89 -71
- nextrec/models/match/sdm.py +93 -81
- nextrec/models/match/youtube_dnn.py +62 -53
- nextrec/models/multi_task/esmm.py +49 -43
- nextrec/models/multi_task/mmoe.py +65 -56
- nextrec/models/multi_task/ple.py +92 -65
- nextrec/models/multi_task/share_bottom.py +48 -42
- nextrec/models/ranking/__init__.py +7 -7
- nextrec/models/ranking/afm.py +39 -30
- nextrec/models/ranking/autoint.py +70 -57
- nextrec/models/ranking/dcn.py +43 -35
- nextrec/models/ranking/deepfm.py +34 -28
- nextrec/models/ranking/dien.py +115 -79
- nextrec/models/ranking/din.py +84 -60
- nextrec/models/ranking/fibinet.py +51 -35
- nextrec/models/ranking/fm.py +28 -26
- nextrec/models/ranking/masknet.py +31 -31
- nextrec/models/ranking/pnn.py +30 -31
- nextrec/models/ranking/widedeep.py +36 -31
- nextrec/models/ranking/xdeepfm.py +46 -39
- nextrec/utils/__init__.py +9 -9
- nextrec/utils/embedding.py +1 -1
- nextrec/utils/initializer.py +23 -15
- nextrec/utils/optimizer.py +14 -10
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/METADATA +6 -40
- nextrec-0.1.2.dist-info/RECORD +51 -0
- nextrec-0.1.1.dist-info/RECORD +0 -51
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/WHEEL +0 -0
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/licenses/LICENSE +0 -0
nextrec/__init__.py
CHANGED

@@ -12,18 +12,18 @@ Quick Start
 -----------
 >>> from nextrec.basic.features import DenseFeature, SparseFeature
 >>> from nextrec.models.ranking.deepfm import DeepFM
->>>
+>>>
 >>> # Define features
 >>> dense_features = [DenseFeature('age')]
 >>> sparse_features = [SparseFeature('category', vocab_size=100, embedding_dim=16)]
->>>
+>>>
 >>> # Build model
 >>> model = DeepFM(
 ...     dense_features=dense_features,
 ...     sparse_features=sparse_features,
 ...     targets=['label']
 ... )
->>>
+>>>
 >>> # Train model
 >>> model.fit(train_data=df_train, valid_data=df_valid)
 """

@@ -31,7 +31,7 @@ Quick Start
 from nextrec.__version__ import __version__
 
 __all__ = [
-
+    "__version__",
 ]
 
 # Package metadata

nextrec/__version__.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.1.1"
+__version__ = "0.1.2"

nextrec/basic/activation.py
CHANGED

@@ -14,40 +14,41 @@ class Dice(nn.Module):
     """
     Dice activation function from the paper:
     "Deep Interest Network for Click-Through Rate Prediction" (Zhou et al., 2018)
-
+
     Dice(x) = p(x) * x + (1 - p(x)) * alpha * x
     where p(x) = sigmoid((x - E[x]) / sqrt(Var[x] + epsilon))
     """
+
     def __init__(self, emb_size: int, epsilon: float = 1e-9):
         super(Dice, self).__init__()
         self.epsilon = epsilon
         self.alpha = nn.Parameter(torch.zeros(emb_size))
         self.bn = nn.BatchNorm1d(emb_size)
-
+
     def forward(self, x):
         # x shape: (batch_size, emb_size) or (batch_size, seq_len, emb_size)
         original_shape = x.shape
-
+
         if x.dim() == 3:
             # For 3D input (batch_size, seq_len, emb_size), reshape to 2D
             batch_size, seq_len, emb_size = x.shape
             x = x.view(-1, emb_size)
-
+
         x_norm = self.bn(x)
         p = torch.sigmoid(x_norm)
         output = p * x + (1 - p) * self.alpha * x
-
+
         if len(original_shape) == 3:
             output = output.view(original_shape)
-
+
         return output
 
 
 def activation_layer(activation: str, emb_size: int | None = None):
     """Create an activation layer based on the given activation name."""
-
+
     activation = activation.lower()
-
+
     if activation == "dice":
         if emb_size is None:
             raise ValueError("emb_size is required for Dice activation")

@@ -89,4 +90,4 @@ def activation_layer(activation: str, emb_size: int | None = None):
     elif activation in ["none", "linear", "identity"]:
         return nn.Identity()
     else:
-        raise ValueError(f"Unsupported activation function: {activation}")
+        raise ValueError(f"Unsupported activation function: {activation}")

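For reference, the Dice formula documented in this file can be reproduced outside the module. The following is a minimal standalone sketch of that formula in plain PyTorch, using per-batch statistics in place of the module's BatchNorm1d and a fixed alpha; it does not import nextrec.

    import torch

    x = torch.randn(4, 8)              # (batch_size, emb_size)
    alpha = torch.zeros(8)             # a learnable nn.Parameter in the real module
    eps = 1e-9
    x_norm = (x - x.mean(dim=0)) / torch.sqrt(x.var(dim=0, unbiased=False) + eps)
    p = torch.sigmoid(x_norm)          # p(x) = sigmoid((x - E[x]) / sqrt(Var[x] + eps))
    out = p * x + (1 - p) * alpha * x  # Dice(x)
    print(out.shape)                   # torch.Size([4, 8])
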
nextrec/basic/callback.py
CHANGED
nextrec/basic/dataloader.py
CHANGED

@@ -30,16 +30,18 @@ class FileDataset(IterableDataset):
     Iterable dataset for reading multiple files in batches.
     Supports CSV and Parquet files with chunk-based reading.
     """
-
-    def __init__(
-
-
-
-
-
-
-
-
+
+    def __init__(
+        self,
+        file_paths: list[str], # file paths to read, containing CSV or Parquet files
+        dense_features: list[DenseFeature], # dense feature definitions
+        sparse_features: list[SparseFeature], # sparse feature definitions
+        sequence_features: list[SequenceFeature], # sequence feature definitions
+        target_columns: list[str], # target column names
+        chunk_size: int = 10000,
+        file_type: Literal["csv", "parquet"] = "csv",
+        processor: Optional["DataProcessor"] = None,
+    ): # optional DataProcessor for transformation
 
         self.file_paths = file_paths
         self.dense_features = dense_features
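The reflowed signature above spells out the full FileDataset constructor. A rough usage sketch, assuming the feature constructors from the Quick Start example and hypothetical file names:

    from nextrec.basic.dataloader import FileDataset
    from nextrec.basic.features import DenseFeature, SparseFeature

    dataset = FileDataset(
        file_paths=["day_0.parquet", "day_1.parquet"],  # hypothetical paths
        dense_features=[DenseFeature("age")],
        sparse_features=[SparseFeature("category", vocab_size=100, embedding_dim=16)],
        sequence_features=[],
        target_columns=["label"],
        chunk_size=10000,
        file_type="parquet",
        processor=None,  # or an already-fitted DataProcessor
    )
    for chunk_tensors in dataset:  # one tuple of tensors per chunk of chunk_size rows
        ...
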

@@ -49,30 +51,30 @@ class FileDataset(IterableDataset):
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-
+
         self.all_features = dense_features + sparse_features + sequence_features
         self.feature_names = [f.name for f in self.all_features]
         self.current_file_index = 0
         self.total_files = len(file_paths)
-
+
     def __iter__(self) -> Iterator[tuple]:
         self.current_file_index = 0
         self._file_pbar = None
-
+
         # Create progress bar for file processing when multiple files
         if self.total_files > 1:
             self._file_pbar = tqdm.tqdm(
-                total=self.total_files,
-                desc="Files",
+                total=self.total_files,
+                desc="Files",
                 unit="file",
                 position=0,
                 leave=True,
-                bar_format=
+                bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
             )
-
+
         for file_path in self.file_paths:
             self.current_file_index += 1
-
+
             # Update file progress bar
             if self._file_pbar is not None:
                 self._file_pbar.update(1)

@@ -80,48 +82,51 @@ class FileDataset(IterableDataset):
                 # For single file, log the file name
                 file_name = os.path.basename(file_path)
                 logging.info(colorize(f"Processing file: {file_name}", color="cyan"))
-
-            if self.file_type ==
+
+            if self.file_type == "csv":
                 yield from self._read_csv_chunks(file_path)
-            elif self.file_type ==
+            elif self.file_type == "parquet":
                 yield from self._read_parquet_chunks(file_path)
-
+
         # Close file progress bar
         if self._file_pbar is not None:
             self._file_pbar.close()
-
+
     def _read_csv_chunks(self, file_path: str) -> Iterator[tuple]:
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
-
+
         for chunk in chunk_iterator:
             tensors = self._dataframe_to_tensors(chunk)
            if tensors:
                 yield tensors
-
+
     def _read_parquet_chunks(self, file_path: str) -> Iterator[tuple]:
         """
         Read parquet file in chunks to reduce memory footprint.
         Uses pyarrow's batch reading for true streaming.
         """
         import pyarrow.parquet as pq
+
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
+            chunk = batch.to_pandas()
             tensors = self._dataframe_to_tensors(chunk)
             if tensors:
                 yield tensors
             del chunk
-
+
     def _dataframe_to_tensors(self, df: pd.DataFrame) -> tuple | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError(
+                raise ValueError(
+                    "DataProcessor must be fitted before using in streaming mode"
+                )
             transformed_data = self.processor.transform(df, return_dict=True)
         else:
             transformed_data = df
-
+
         tensors = []
-
+
         # Process features
         for feature in self.all_features:
             if self.processor is not None:
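_read_parquet_chunks above relies on pyarrow's batch iterator so that only chunk_size rows are materialized at a time. A self-contained sketch of the same pattern, with a placeholder file name:

    import pyarrow.parquet as pq

    parquet_file = pq.ParquetFile("events.parquet")  # placeholder path
    for batch in parquet_file.iter_batches(batch_size=10000):
        chunk = batch.to_pandas()  # one pandas DataFrame per batch, never the whole file
        print(len(chunk))          # stand-in for _dataframe_to_tensors(chunk)
        del chunk
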

@@ -131,10 +136,15 @@ class FileDataset(IterableDataset):
             else:
                 # Get data from original dataframe
                 if feature.name not in df.columns:
-                    logging.warning(
+                    logging.warning(
+                        colorize(
+                            f"Feature column '{feature.name}' not found in DataFrame",
+                            "yellow",
+                        )
+                    )
                     continue
                 column_data = df[feature.name].values
-
+
             # Handle sequence features: convert to 2D array of shape (batch_size, seq_length)
             if isinstance(feature, SequenceFeature):
                 if isinstance(column_data, np.ndarray) and column_data.dtype == object:

@@ -143,25 +153,29 @@ class FileDataset(IterableDataset):
                     except (ValueError, TypeError) as e:
                         # Fallback: handle variable-length sequences by padding
                         sequences = []
-                        max_len = feature.max_len if hasattr(feature,
+                        max_len = feature.max_len if hasattr(feature, "max_len") else 0
                         for seq in column_data:
                             if isinstance(seq, (list, tuple, np.ndarray)):
                                 seq_arr = np.asarray(seq, dtype=np.int64)
                             else:
                                 seq_arr = np.array([], dtype=np.int64)
                             sequences.append(seq_arr)
-
+
                         # Pad sequences to same length
                         if max_len == 0:
-                            max_len =
-
+                            max_len = (
+                                max(len(seq) for seq in sequences) if sequences else 1
+                            )
+
                         padded = []
                         for seq in sequences:
                             if len(seq) > max_len:
                                 padded.append(seq[:max_len])
                             else:
                                 pad_width = max_len - len(seq)
-                                padded.append(
+                                padded.append(
+                                    np.pad(seq, (0, pad_width), constant_values=0)
+                                )
                         column_data = np.stack(padded)
                 else:
                     column_data = np.asarray(column_data, dtype=np.int64)
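The fallback branch above handles ragged sequence columns by truncating or right-padding every row to a common length. A standalone sketch of that padding step with made-up data:

    import numpy as np

    sequences = [np.array([3, 7]), np.array([5]), np.array([1, 2, 9, 4])]
    max_len = 3  # feature.max_len, or the longest sequence when it is unset
    padded = []
    for seq in sequences:
        if len(seq) > max_len:
            padded.append(seq[:max_len])
        else:
            padded.append(np.pad(seq, (0, max_len - len(seq)), constant_values=0))
    column_data = np.stack(padded)
    print(column_data)  # rows: [3 7 0], [5 0 0], [1 2 9]
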

@@ -170,43 +184,43 @@ class FileDataset(IterableDataset):
                 tensor = torch.from_numpy(np.asarray(column_data, dtype=np.float32))
             else: # SparseFeature
                 tensor = torch.from_numpy(np.asarray(column_data, dtype=np.int64))
-
+
             tensors.append(tensor)
-
+
         # Process targets
         target_tensors = []
         for target_name in self.target_columns:
             if self.processor is not None:
                 target_data = transformed_data.get(target_name)
-                if target_data is None:
+                if target_data is None:
                     continue
             else:
                 if target_name not in df.columns:
                     continue
                 target_data = df[target_name].values
-
+
             target_tensor = torch.from_numpy(np.asarray(target_data, dtype=np.float32))
-
+
             if target_tensor.dim() == 1:
                 target_tensor = target_tensor.view(-1, 1)
-
+
             target_tensors.append(target_tensor)
-
+
         # Combine target tensors
         if target_tensors:
             if len(target_tensors) == 1 and target_tensors[0].shape[1] > 1:
                 y_tensor = target_tensors[0]
             else:
                 y_tensor = torch.cat(target_tensors, dim=1)
-
+
             if y_tensor.shape[1] == 1:
                 y_tensor = y_tensor.squeeze(1)
-
+
             tensors.append(y_tensor)
-
+
         if not tensors:
             return None
-
+
         return tuple(tensors)
 
 
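The target handling above reshapes every target column to (N, 1), concatenates the columns, and squeezes back to a flat vector when only a single target remains. A small illustration with toy labels:

    import torch

    clicks = torch.tensor([1.0, 0.0, 1.0]).view(-1, 1)
    buys = torch.tensor([0.0, 0.0, 1.0]).view(-1, 1)

    y_multi = torch.cat([clicks, buys], dim=1)  # shape (3, 2) for multi-task targets
    y_single = torch.cat([clicks], dim=1)       # shape (3, 1) for a single target
    if y_single.shape[1] == 1:
        y_single = y_single.squeeze(1)          # shape (3,)
    print(y_multi.shape, y_single.shape)
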

@@ -226,13 +240,15 @@ class RecDataLoader:
    >>>     processor=processor
    >>> )
    """
-
-    def __init__(
-
-
-
-
-
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        target: list[str] | None | str = None,
+        processor: Optional["DataProcessor"] = None,
+    ):
 
         self.dense_features = dense_features if dense_features else []
         self.sparse_features = sparse_features if sparse_features else []

@@ -244,41 +260,48 @@ class RecDataLoader:
         else:
             self.target_columns = []
         self.processor = processor
-
-        self.all_features =
-
-
-
-
-
-
+
+        self.all_features = (
+            self.dense_features + self.sparse_features + self.sequence_features
+        )
+
+    def create_dataloader(
+        self,
+        data: Union[dict, pd.DataFrame, str, DataLoader],
+        batch_size: int = 32,
+        shuffle: bool = True,
+        load_full: bool = True,
+        chunk_size: int = 10000,
+    ) -> DataLoader:
         """
         Create DataLoader from various data sources.
         """
         if isinstance(data, DataLoader):
             return data
-
+
         if isinstance(data, (str, os.PathLike)):
-            return self._create_from_path(
-
+            return self._create_from_path(
+                data, batch_size, shuffle, load_full, chunk_size
+            )
+
         if isinstance(data, (dict, pd.DataFrame)):
             return self._create_from_memory(data, batch_size, shuffle)
 
         raise ValueError(f"Unsupported data type: {type(data)}")
-
-    def _create_from_memory(
-
-
-        shuffle: bool) -> DataLoader:
+
+    def _create_from_memory(
+        self, data: Union[dict, pd.DataFrame], batch_size: int, shuffle: bool
+    ) -> DataLoader:
 
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError(
+                raise ValueError(
+                    "DataProcessor must be fitted before using in RecDataLoader"
+                )
             data = self.processor.transform(data, return_dict=True)
-
+
         tensors = []
-
+
         # Process features
         for feature in self.all_features:
             column = get_column_data(data, feature.name)
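create_dataloader above accepts an existing DataLoader, a file or directory path, or in-memory data (dict / DataFrame). A rough in-memory usage sketch, with illustrative column names and no DataProcessor:

    import pandas as pd
    from nextrec.basic.dataloader import RecDataLoader
    from nextrec.basic.features import DenseFeature, SparseFeature

    df = pd.DataFrame({"age": [0.1, 0.5, 0.3], "category": [3, 7, 2], "label": [1, 0, 1]})
    loader = RecDataLoader(
        dense_features=[DenseFeature("age")],
        sparse_features=[SparseFeature("category", vocab_size=100, embedding_dim=16)],
        target="label",
    )
    train_loader = loader.create_dataloader(df, batch_size=2, shuffle=True)
    for *features, y in train_loader:
        print([t.shape for t in features], y.shape)
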

@@ -288,97 +311,111 @@ class RecDataLoader:
             if isinstance(feature, SequenceFeature):
                 if isinstance(column, pd.Series):
                     column = column.values
-
+
                 # Handle different input formats for sequence features
                 if isinstance(column, np.ndarray):
                     # Check if elements are actually sequences (not just object dtype scalars)
-                    if
+                    if (
+                        column.dtype == object
+                        and len(column) > 0
+                        and isinstance(column[0], (list, tuple, np.ndarray))
+                    ):
                         # Each element is a sequence (array/list), stack them into 2D array
                         try:
                             column = np.stack([np.asarray(seq, dtype=np.int64) for seq in column])  # type: ignore
                         except (ValueError, TypeError) as e:
                             # Fallback: handle variable-length sequences by padding
                             sequences = []
-                            max_len =
+                            max_len = (
+                                feature.max_len if hasattr(feature, "max_len") else 0
+                            )
                             for seq in column:
                                 if isinstance(seq, (list, tuple, np.ndarray)):
                                     seq_arr = np.asarray(seq, dtype=np.int64)
                                 else:
                                     seq_arr = np.array([], dtype=np.int64)
                                 sequences.append(seq_arr)
-
+
                             # Pad sequences to same length
                             if max_len == 0:
-                                max_len =
-
+                                max_len = (
+                                    max(len(seq) for seq in sequences)
+                                    if sequences
+                                    else 1
+                                )
+
                             padded = []
                             for seq in sequences:
                                 if len(seq) > max_len:
                                     padded.append(seq[:max_len])
                                 else:
                                     pad_width = max_len - len(seq)
-                                    padded.append(
+                                    padded.append(
+                                        np.pad(seq, (0, pad_width), constant_values=0)
+                                    )
                             column = np.stack(padded)
                     elif column.ndim == 1:
                         # 1D array, need to reshape or handle appropriately
                         # Assuming each element should be treated as a single-item sequence
                         column = column.reshape(-1, 1)
                     # else: already a 2D array
-
+
                 column = np.asarray(column, dtype=np.int64)
                 tensor = torch.from_numpy(column)
-
+
             elif isinstance(feature, DenseFeature):
                 tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
             else: # SparseFeature
                 tensor = torch.from_numpy(np.asarray(column, dtype=np.int64))
-
+
             tensors.append(tensor)
-
+
         # Process targets
         label_tensors = []
         for target_name in self.target_columns:
             column = get_column_data(data, target_name)
             if column is None:
                 continue
-
+
             label_tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
-
+
             if label_tensor.dim() == 1:
                 label_tensor = label_tensor.view(-1, 1)
             elif label_tensor.dim() == 2:
                 if label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
                     label_tensor = label_tensor.t()
-
+
             label_tensors.append(label_tensor)
-
+
         # Combine target tensors
         if label_tensors:
             if len(label_tensors) == 1 and label_tensors[0].shape[1] > 1:
                 y_tensor = label_tensors[0]
             else:
                 y_tensor = torch.cat(label_tensors, dim=1)
-
+
             if y_tensor.shape[1] == 1:
                 y_tensor = y_tensor.squeeze(1)
-
+
             tensors.append(y_tensor)
-
+
         dataset = TensorDataset(*tensors)
         return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
-
-    def _create_from_path(
-
-
-
-
-
+
+    def _create_from_path(
+        self,
+        path: str,
+        batch_size: int,
+        shuffle: bool,
+        load_full: bool,
+        chunk_size: int,
+    ) -> DataLoader:
         """
         Create DataLoader from a file path, supporting CSV and Parquet formats, with options for full loading or streaming.
         """
 
         path_obj = Path(path)
-
+
         # Determine if it's a file or directory
         if path_obj.is_file():
             file_paths = [str(path_obj)]

@@ -387,41 +424,46 @@ class RecDataLoader:
             # Find all CSV and Parquet files in directory
             csv_files = glob.glob(os.path.join(path, "*.csv"))
             parquet_files = glob.glob(os.path.join(path, "*.parquet"))
-
+
             if csv_files and parquet_files:
-                raise ValueError(
-
+                raise ValueError(
+                    "Directory contains both CSV and Parquet files. Please use a single format."
+                )
+
             file_paths = csv_files if csv_files else parquet_files
-
+
             if not file_paths:
                 raise ValueError(f"No CSV or Parquet files found in directory: {path}")
-
-            file_type =
+
+            file_type = "csv" if csv_files else "parquet"
             file_paths.sort() # Sort for consistent ordering
         else:
             raise ValueError(f"Invalid path: {path}")
-
+
         # Load full data into memory or use streaming
         if load_full:
             dfs = []
             for file_path in file_paths:
-                if file_type ==
+                if file_type == "csv":
                     df = pd.read_csv(file_path)
                 else: # parquet
                     df = pd.read_parquet(file_path)
                 dfs.append(df)
-
+
             combined_df = pd.concat(dfs, ignore_index=True)
             return self._create_from_memory(combined_df, batch_size, shuffle)
         else:
-            return self._load_files_streaming(
-
-
-
-
-
-
-
+            return self._load_files_streaming(
+                file_paths, file_type, batch_size, chunk_size
+            )
+
+    def _load_files_streaming(
+        self,
+        file_paths: list[str],
+        file_type: Literal["csv", "parquet"],
+        batch_size: int,
+        chunk_size: int,
+    ) -> DataLoader:
         # Create FileDataset for streaming
         dataset = FileDataset(
             file_paths=file_paths,
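Pointing create_dataloader at a directory with load_full=False routes through the _load_files_streaming path above instead of concatenating everything in memory. A streaming sketch, reusing the loader from the previous example and a placeholder directory of CSV or Parquet files:

    stream_loader = loader.create_dataloader(
        "data/train",        # placeholder directory
        batch_size=512,
        load_full=False,     # stream file by file, chunk_size rows at a time
        chunk_size=20000,
    )
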

@@ -431,17 +473,16 @@ class RecDataLoader:
             target_columns=self.target_columns,
             chunk_size=chunk_size,
             file_type=file_type,
-            processor=self.processor
+            processor=self.processor,
         )
-
+
         return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
-
-    def _get_file_type(self, file_path: str) -> Literal[
+
+    def _get_file_type(self, file_path: str) -> Literal["csv", "parquet"]:
         ext = os.path.splitext(file_path)[1].lower()
-        if ext ==
-            return
-        elif ext ==
-            return
+        if ext == ".csv":
+            return "csv"
+        elif ext == ".parquet":
+            return "parquet"
         else:
             raise ValueError(f"Unsupported file type: {ext}")
-