torchtextclassifiers 0.0.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. torchTextClassifiers/__init__.py +12 -48
  2. torchTextClassifiers/dataset/__init__.py +1 -0
  3. torchTextClassifiers/dataset/dataset.py +152 -0
  4. torchTextClassifiers/model/__init__.py +2 -0
  5. torchTextClassifiers/model/components/__init__.py +12 -0
  6. torchTextClassifiers/model/components/attention.py +126 -0
  7. torchTextClassifiers/model/components/categorical_var_net.py +128 -0
  8. torchTextClassifiers/model/components/classification_head.py +61 -0
  9. torchTextClassifiers/model/components/text_embedder.py +220 -0
  10. torchTextClassifiers/model/lightning.py +170 -0
  11. torchTextClassifiers/model/model.py +151 -0
  12. torchTextClassifiers/tokenizers/WordPiece.py +92 -0
  13. torchTextClassifiers/tokenizers/__init__.py +10 -0
  14. torchTextClassifiers/tokenizers/base.py +205 -0
  15. torchTextClassifiers/tokenizers/ngram.py +472 -0
  16. torchTextClassifiers/torchTextClassifiers.py +500 -413
  17. torchTextClassifiers/utilities/__init__.py +0 -3
  18. torchTextClassifiers/utilities/plot_explainability.py +184 -0
  19. torchtextclassifiers-1.0.0.dist-info/METADATA +87 -0
  20. torchtextclassifiers-1.0.0.dist-info/RECORD +21 -0
  21. {torchtextclassifiers-0.0.1.dist-info → torchtextclassifiers-1.0.0.dist-info}/WHEEL +1 -1
  22. torchTextClassifiers/classifiers/base.py +0 -83
  23. torchTextClassifiers/classifiers/fasttext/__init__.py +0 -25
  24. torchTextClassifiers/classifiers/fasttext/core.py +0 -269
  25. torchTextClassifiers/classifiers/fasttext/model.py +0 -752
  26. torchTextClassifiers/classifiers/fasttext/tokenizer.py +0 -346
  27. torchTextClassifiers/classifiers/fasttext/wrapper.py +0 -216
  28. torchTextClassifiers/classifiers/simple_text_classifier.py +0 -191
  29. torchTextClassifiers/factories.py +0 -34
  30. torchTextClassifiers/utilities/checkers.py +0 -108
  31. torchTextClassifiers/utilities/preprocess.py +0 -82
  32. torchTextClassifiers/utilities/utils.py +0 -346
  33. torchtextclassifiers-0.0.1.dist-info/METADATA +0 -187
  34. torchtextclassifiers-0.0.1.dist-info/RECORD +0 -17
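
Note: the file list above shows the 0.0.1 wrapper-based layout (classifiers/, factories.py, utilities/checkers.py, utilities/preprocess.py, utilities/utils.py) being removed in favour of dataset/, model/, and tokenizers/ subpackages. As a rough sketch of what that means for imports, based only on the paths listed above and the import statements and docstring examples visible in the diff below (the full public exports of 1.0.0 are not shown in this diff):

# 0.0.1 layout (modules removed in 1.0.0)
from torchTextClassifiers import torchTextClassifiers
from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper

# 1.0.0 layout (modules added in 1.0.0)
from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers
from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput
from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule
from torchTextClassifiers.dataset import TextClassificationDataset
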
torchTextClassifiers/torchTextClassifiers.py
@@ -1,7 +1,15 @@
  import logging
  import time
- import json
- from typing import Optional, Union, Type, List, Dict, Any
+ from dataclasses import asdict, dataclass, field
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+ try:
+     from captum.attr import LayerIntegratedGradients
+
+     HAS_CAPTUM = True
+ except ImportError:
+     HAS_CAPTUM = False
+

  import numpy as np
  import pytorch_lightning as pl
@@ -12,9 +20,17 @@ from pytorch_lightning.callbacks import (
      ModelCheckpoint,
  )

- from .utilities.checkers import check_X, check_Y, NumpyJSONEncoder
- from .classifiers.base import BaseClassifierConfig, BaseClassifierWrapper
-
+ from torchTextClassifiers.dataset import TextClassificationDataset
+ from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule
+ from torchTextClassifiers.model.components import (
+     AttentionConfig,
+     CategoricalForwardType,
+     CategoricalVariableNet,
+     ClassificationHead,
+     TextEmbedder,
+     TextEmbedderConfig,
+ )
+ from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput

  logger = logging.getLogger(__name__)

@@ -26,484 +42,555 @@ logging.basicConfig(
  )


+ @dataclass
+ class ModelConfig:
+     """Base configuration class for text classifiers."""
+
+     embedding_dim: int
+     categorical_vocabulary_sizes: Optional[List[int]] = None
+     categorical_embedding_dims: Optional[Union[List[int], int]] = None
+     num_classes: Optional[int] = None
+     attention_config: Optional[AttentionConfig] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig":
+         return cls(**data)
+
+
+ @dataclass
+ class TrainingConfig:
+     num_epochs: int
+     batch_size: int
+     lr: float
+     loss: torch.nn.Module = field(default_factory=lambda: torch.nn.CrossEntropyLoss())
+     optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam
+     scheduler: Optional[Type[torch.optim.lr_scheduler._LRScheduler]] = None
+     accelerator: str = "auto"
+     num_workers: int = 12
+     patience_early_stopping: int = 3
+     dataloader_params: Optional[dict] = None
+     trainer_params: Optional[dict] = None
+     optimizer_params: Optional[dict] = None
+     scheduler_params: Optional[dict] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         data = asdict(self)
+         # Serialize loss and scheduler as their class names
+         data["loss"] = self.loss.__class__.__name__
+         if self.scheduler is not None:
+             data["scheduler"] = self.scheduler.__name__
+         return data

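
Note: ModelConfig and TrainingConfig above replace the per-classifier config objects of 0.0.1. A minimal sketch of how they might be filled in, with illustrative values; the top-level import path follows the docstring example further down in this diff and is otherwise an assumption:

from torchTextClassifiers import ModelConfig, TrainingConfig

model_config = ModelConfig(
    embedding_dim=64,                       # size of the text embedding
    categorical_vocabulary_sizes=[30, 25],  # one vocabulary size per categorical column
    categorical_embedding_dims=[10, 5],     # per-column embedding sizes (or a single int)
    num_classes=10,
)

training_config = TrainingConfig(
    num_epochs=5,
    batch_size=32,
    lr=1e-3,                                # remaining fields keep their defaults (Adam, CrossEntropyLoss, ...)
)

print(training_config.to_dict()["loss"])    # "CrossEntropyLoss": loss is serialized by class name

Note that loss is stored as an instance (via default_factory) while optimizer and scheduler are stored as classes, which is why to_dict() serializes them differently.
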
  class torchTextClassifiers:
      """Generic text classifier framework supporting multiple architectures.
-
-     This is the main class that provides a unified interface for different types
-     of text classifiers. It acts as a high-level wrapper that delegates operations
-     to specific classifier implementations while providing a consistent API.
-
-     The class supports the full machine learning workflow including:
-     - Building tokenizers from training data
-     - Model training with validation
-     - Prediction and evaluation
-     - Model serialization and loading
-
-     Attributes:
-         config: Configuration object specific to the classifier type
-         classifier: The underlying classifier implementation
-
-     Example:
-         >>> from torchTextClassifiers import torchTextClassifiers
-         >>> from torchTextClassifiers.classifiers.fasttext.config import FastTextConfig
-         >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper
-         >>>
-         >>> # Create configuration
-         >>> config = FastTextConfig(
-         ...     embedding_dim=100,
-         ...     num_tokens=10000,
-         ...     min_count=1,
-         ...     min_n=3,
-         ...     max_n=6,
-         ...     len_word_ngrams=2,
-         ...     num_classes=2
-         ... )
-         >>>
-         >>> # Initialize classifier with wrapper
-         >>> wrapper = FastTextWrapper(config)
-         >>> classifier = torchTextClassifiers(wrapper)
-         >>>
-         >>> # Build and train
-         >>> classifier.build(X_train, y_train)
-         >>> classifier.train(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32)
-         >>>
-         >>> # Predict
-         >>> predictions = classifier.predict(X_test)
+
+     Given a tokenizer and model configuration, this class initializes:
+     - Text embedding layer (if needed)
+     - Categorical variable embedding network (if categorical variables are provided)
+     - Classification head
+     The resulting model can be trained using PyTorch Lightning and used for predictions.
+
      """
-
-     def __init__(self, classifier: BaseClassifierWrapper):
-         """Initialize the torchTextClassifiers instance.
-
-         Args:
-             classifier: An instance of a classifier wrapper that implements BaseClassifierWrapper
-
-         Example:
-             >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper
-             >>> from torchTextClassifiers.classifiers.fasttext.config import FastTextConfig
-             >>> config = FastTextConfig(embedding_dim=50, num_tokens=5000)
-             >>> wrapper = FastTextWrapper(config)
-             >>> classifier = torchTextClassifiers(wrapper)
-         """
-         self.classifier = classifier
-         self.config = classifier.config
-
-
-     def build_tokenizer(self, training_text: np.ndarray) -> None:
-         """Build tokenizer from training text data.
-
-         This method is kept for backward compatibility. It delegates to
-         prepare_text_features which handles the actual text preprocessing.
-
-         Args:
-             training_text: Array of text strings to build the tokenizer from
-
-         Example:
-             >>> import numpy as np
-             >>> texts = np.array(["Hello world", "This is a test", "Another example"])
-             >>> classifier.build_tokenizer(texts)
-         """
-         self.classifier.prepare_text_features(training_text)
-
-     def prepare_text_features(self, training_text: np.ndarray) -> None:
-         """Prepare text features for the classifier.
-
-         This method handles text preprocessing which could involve tokenization,
-         vectorization, or other approaches depending on the classifier type.
-
-         Args:
-             training_text: Array of text strings to prepare features from
-
-         Example:
-             >>> import numpy as np
-             >>> texts = np.array(["Hello world", "This is a test", "Another example"])
-             >>> classifier.prepare_text_features(texts)
-         """
-         self.classifier.prepare_text_features(training_text)
-
-     def build(
+
+     def __init__(
          self,
-         X_train: np.ndarray,
-         y_train: np.ndarray = None,
-         lightning=True,
-         **kwargs
-     ) -> None:
-         """Build the complete classifier from training data.
-
-         This method handles the full model building process including:
-         - Input validation and preprocessing
-         - Tokenizer creation from training text
-         - Model architecture initialization
-         - Lightning module setup (if enabled)
-
+         tokenizer: BaseTokenizer,
+         model_config: ModelConfig,
+         ragged_multilabel: bool = False,
+     ):
+         """Initialize the torchTextClassifiers instance.
+
          Args:
-             X_train: Training input data (text and optional categorical features)
-             y_train: Training labels (optional, can be inferred if num_classes is set)
-             lightning: Whether to initialize PyTorch Lightning components
-             **kwargs: Additional arguments passed to Lightning initialization
-
-         Raises:
-             ValueError: If y_train is None and num_classes is not set in config
-             ValueError: If label values are outside expected range
-
+             tokenizer: A tokenizer instance for text preprocessing
+             model_config: Configuration parameters for the text classification model
+
          Example:
-             >>> X_train = np.array(["text sample 1", "text sample 2"])
-             >>> y_train = np.array([0, 1])
-             >>> classifier.build(X_train, y_train)
+             >>> from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers
+             >>> # Assume tokenizer is a trained BaseTokenizer instance
+             >>> model_config = ModelConfig(
+             ...     embedding_dim=10,
+             ...     categorical_vocabulary_sizes=[30, 25],
+             ...     categorical_embedding_dims=[10, 5],
+             ...     num_classes=10,
+             ... )
+             >>> ttc = torchTextClassifiers(
+             ...     tokenizer=tokenizer,
+             ...     model_config=model_config,
+             ... )
          """
-         training_text, categorical_variables, no_cat_var = check_X(X_train)
-
-         if y_train is not None:
-             if self.config.num_classes is not None:
-                 if self.config.num_classes != len(np.unique(y_train)):
-                     logger.warning(
-                         f"Updating num_classes from {self.config.num_classes} to {len(np.unique(y_train))}"
-                     )
-
-             y_train = check_Y(y_train)
-             self.config.num_classes = len(np.unique(y_train))
-
-             if np.max(y_train) >= self.config.num_classes:
-                 raise ValueError(
-                     "y_train must contain values between 0 and num_classes-1"
+
+         self.model_config = model_config
+         self.tokenizer = tokenizer
+         self.ragged_multilabel = ragged_multilabel
+
+         if hasattr(self.tokenizer, "trained"):
+             if not self.tokenizer.trained:
+                 raise RuntimeError(
+                     f"Tokenizer {type(self.tokenizer)} must be trained before initializing the classifier."
                  )
+
+         self.vocab_size = tokenizer.vocab_size
+         self.embedding_dim = model_config.embedding_dim
+         self.categorical_vocabulary_sizes = model_config.categorical_vocabulary_sizes
+         self.num_classes = model_config.num_classes
+
+         if self.tokenizer.output_vectorized:
+             self.text_embedder = None
+             logger.info(
+                 "Tokenizer outputs vectorized tokens; skipping TextEmbedder initialization."
+             )
+             self.embedding_dim = self.tokenizer.output_dim
          else:
-             if self.config.num_classes is None:
-                 raise ValueError(
-                     "Either num_classes must be provided at init or y_train must be provided here."
-                 )
-
-         # Handle categorical variables
-         if not no_cat_var:
-             if hasattr(self.config, 'num_categorical_features') and self.config.num_categorical_features is not None:
-                 if self.config.num_categorical_features != categorical_variables.shape[1]:
-                     logger.warning(
-                         f"Updating num_categorical_features from {self.config.num_categorical_features} to {categorical_variables.shape[1]}"
-                     )
-
-             if hasattr(self.config, 'num_categorical_features'):
-                 self.config.num_categorical_features = categorical_variables.shape[1]
-
-             categorical_vocabulary_sizes = np.max(categorical_variables, axis=0) + 1
-
-             if hasattr(self.config, 'categorical_vocabulary_sizes') and self.config.categorical_vocabulary_sizes is not None:
-                 if self.config.categorical_vocabulary_sizes != list(categorical_vocabulary_sizes):
-                     logger.warning(
-                         "Overwriting categorical_vocabulary_sizes with values from training data."
-                     )
-             if hasattr(self.config, 'categorical_vocabulary_sizes'):
-                 self.config.categorical_vocabulary_sizes = list(categorical_vocabulary_sizes)
-
-         self.classifier.prepare_text_features(training_text)
-         self.classifier._build_pytorch_model()
-
-         if lightning:
-             self.classifier._check_and_init_lightning(**kwargs)
-
+             text_embedder_config = TextEmbedderConfig(
+                 vocab_size=self.vocab_size,
+                 embedding_dim=self.embedding_dim,
+                 padding_idx=tokenizer.padding_idx,
+                 attention_config=model_config.attention_config,
+             )
+             self.text_embedder = TextEmbedder(
+                 text_embedder_config=text_embedder_config,
+             )
+
+         classif_head_input_dim = self.embedding_dim
+         if self.categorical_vocabulary_sizes:
+             self.categorical_var_net = CategoricalVariableNet(
+                 categorical_vocabulary_sizes=self.categorical_vocabulary_sizes,
+                 categorical_embedding_dims=model_config.categorical_embedding_dims,
+                 text_embedding_dim=self.embedding_dim,
+             )
+
+             if self.categorical_var_net.forward_type != CategoricalForwardType.SUM_TO_TEXT:
+                 classif_head_input_dim += self.categorical_var_net.output_dim
+
+         else:
+             self.categorical_var_net = None
+
+         self.classification_head = ClassificationHead(
+             input_dim=classif_head_input_dim,
+             num_classes=model_config.num_classes,
+         )
+
+         self.pytorch_model = TextClassificationModel(
+             text_embedder=self.text_embedder,
+             categorical_variable_net=self.categorical_var_net,
+             classification_head=self.classification_head,
+         )
+
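
Note: a small worked sketch of the sizing rules in __init__ above (the numbers are illustrative, not package defaults):

embedding_dim = 64            # ModelConfig.embedding_dim, or tokenizer.output_dim when the tokenizer is already vectorized
categorical_output_dim = 15   # CategoricalVariableNet.output_dim

# forward_type == SUM_TO_TEXT: categorical embeddings are summed into the text embedding,
# so the classification head keeps input_dim = 64.
# Any other forward type: the categorical output is concatenated,
# so the classification head gets input_dim = 64 + 15 = 79.
classif_head_input_dim = embedding_dim + categorical_output_dim  # concatenation case
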
      def train(
          self,
          X_train: np.ndarray,
          y_train: np.ndarray,
-         X_val: np.ndarray,
-         y_val: np.ndarray,
-         num_epochs: int,
-         batch_size: int,
-         cpu_run: bool = False,
-         num_workers: int = 12,
-         patience_train: int = 3,
+         training_config: TrainingConfig,
+         X_val: Optional[np.ndarray] = None,
+         y_val: Optional[np.ndarray] = None,
          verbose: bool = False,
-         trainer_params: Optional[dict] = None,
-         **kwargs
      ) -> None:
          """Train the classifier using PyTorch Lightning.
-
+
          This method handles the complete training process including:
          - Data validation and preprocessing
          - Dataset and DataLoader creation
          - PyTorch Lightning trainer setup with callbacks
          - Model training with early stopping
          - Best model loading after training
-
+
          Args:
              X_train: Training input data
              y_train: Training labels
              X_val: Validation input data
              y_val: Validation labels
-             num_epochs: Maximum number of training epochs
-             batch_size: Batch size for training and validation
-             cpu_run: If True, force training on CPU instead of GPU
-             num_workers: Number of worker processes for data loading
-             patience_train: Number of epochs to wait for improvement before early stopping
-             verbose: If True, print detailed training progress
-             trainer_params: Additional parameters to pass to PyTorch Lightning Trainer
-             **kwargs: Additional arguments passed to the build method
-
+             training_config: Configuration parameters for training
+             verbose: Whether to print training progress information
+
+
          Example:
-             >>> classifier.train(
-             ...     X_train, y_train, X_val, y_val,
-             ...     num_epochs=50,
-             ...     batch_size=32,
-             ...     patience_train=5,
-             ...     verbose=True
-             ... )
+
+             >>> training_config = TrainingConfig(
+             ...     lr=1e-3,
+             ...     batch_size=4,
+             ...     num_epochs=1,
+             ... )
+             >>> ttc.train(
+             ...     X_train=X,
+             ...     y_train=Y,
+             ...     X_val=X,
+             ...     y_val=Y,
+             ...     training_config=training_config,
+             ... )
          """
          # Input validation
-         training_text, train_categorical_variables, train_no_cat_var = check_X(X_train)
-         val_text, val_categorical_variables, val_no_cat_var = check_X(X_val)
-         y_train = check_Y(y_train)
-         y_val = check_Y(y_val)
-
-         # Consistency checks
-         assert train_no_cat_var == val_no_cat_var, (
-             "X_train and X_val must have the same number of categorical variables."
-         )
-         assert X_train.shape[0] == y_train.shape[0], (
-             "X_train and y_train must have the same number of observations."
-         )
-         assert X_train.ndim > 1 and X_train.shape[1] == X_val.shape[1] or X_val.ndim == 1, (
-             "X_train and X_val must have the same number of columns."
-         )
-
+         X_train, y_train = self._check_XY(X_train, y_train)
+
+         if X_val is not None:
+             assert y_val is not None, "y_val must be provided if X_val is provided."
+         if y_val is not None:
+             assert X_val is not None, "X_val must be provided if y_val is provided."
+
+         if X_val is not None and y_val is not None:
+             X_val, y_val = self._check_XY(X_val, y_val)
+
+             if (
+                 X_train["categorical_variables"] is not None
+                 and X_val["categorical_variables"] is not None
+             ):
+                 assert (
+                     X_train["categorical_variables"].ndim > 1
+                     and X_train["categorical_variables"].shape[1]
+                     == X_val["categorical_variables"].shape[1]
+                     or X_val["categorical_variables"].ndim == 1
+                 ), "X_train and X_val must have the same number of columns."
+
          if verbose:
              logger.info("Starting training process...")
-
-         # Device setup
-         if cpu_run:
-             device = torch.device("cpu")
-         else:
+
+         if training_config.accelerator == "auto":
              device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-         self.classifier.device = device
-
+         else:
+             device = torch.device(training_config.accelerator)
+
+         self.device = device
+
+         optimizer_params = {"lr": training_config.lr}
+         if training_config.optimizer_params is not None:
+             optimizer_params.update(training_config.optimizer_params)
+
+         if training_config.loss is torch.nn.CrossEntropyLoss and self.ragged_multilabel:
+             logger.warning(
+                 "⚠️ You have set ragged_multilabel to True but are using CrossEntropyLoss. We would recommend to use torch.nn.BCEWithLogitsLoss for multilabel classification tasks."
+             )
+
+         self.lightning_module = TextClassificationModule(
+             model=self.pytorch_model,
+             loss=training_config.loss,
+             optimizer=training_config.optimizer,
+             optimizer_params=optimizer_params,
+             scheduler=training_config.scheduler,
+             scheduler_params=training_config.scheduler_params
+             if training_config.scheduler_params
+             else {},
+             scheduler_interval="epoch",
+         )
+
+         self.pytorch_model.to(self.device)
+
          if verbose:
              logger.info(f"Running on: {device}")
-
-         # Build model if not already built
-         if self.classifier.pytorch_model is None:
-             if verbose:
-                 start = time.time()
-                 logger.info("Building the model...")
-             self.build(X_train, y_train, **kwargs)
-             if verbose:
-                 end = time.time()
-                 logger.info(f"Model built in {end - start:.2f} seconds.")
-
-         self.classifier.pytorch_model = self.classifier.pytorch_model.to(device)
-
-         # Create datasets and dataloaders using wrapper methods
-         train_dataset = self.classifier.create_dataset(
-             texts=training_text,
-             labels=y_train,
-             categorical_variables=train_categorical_variables,
-         )
-         val_dataset = self.classifier.create_dataset(
-             texts=val_text,
-             labels=y_val,
-             categorical_variables=val_categorical_variables,
-         )
-
-         train_dataloader = self.classifier.create_dataloader(
-             dataset=train_dataset,
-             batch_size=batch_size,
-             num_workers=num_workers,
-             shuffle=True
+
+         train_dataset = TextClassificationDataset(
+             texts=X_train["text"],
+             categorical_variables=X_train["categorical_variables"], # None if no cat vars
+             tokenizer=self.tokenizer,
+             labels=y_train.tolist(),
+             ragged_multilabel=self.ragged_multilabel,
          )
-         val_dataloader = self.classifier.create_dataloader(
-             dataset=val_dataset,
-             batch_size=batch_size,
-             num_workers=num_workers,
-             shuffle=False
+         train_dataloader = train_dataset.create_dataloader(
+             batch_size=training_config.batch_size,
+             num_workers=training_config.num_workers,
+             shuffle=True,
+             **training_config.dataloader_params if training_config.dataloader_params else {},
          )
-
+
+         if X_val is not None and y_val is not None:
+             val_dataset = TextClassificationDataset(
+                 texts=X_val["text"],
+                 categorical_variables=X_val["categorical_variables"], # None if no cat vars
+                 tokenizer=self.tokenizer,
+                 labels=y_val,
+                 ragged_multilabel=self.ragged_multilabel,
+             )
+             val_dataloader = val_dataset.create_dataloader(
+                 batch_size=training_config.batch_size,
+                 num_workers=training_config.num_workers,
+                 shuffle=False,
+                 **training_config.dataloader_params if training_config.dataloader_params else {},
+             )
+         else:
+             val_dataloader = None
+
          # Setup trainer
          callbacks = [
              ModelCheckpoint(
-                 monitor="val_loss",
+                 monitor="val_loss" if val_dataloader is not None else "train_loss",
                  save_top_k=1,
                  save_last=False,
                  mode="min",
              ),
              EarlyStopping(
-                 monitor="val_loss",
-                 patience=patience_train,
+                 monitor="val_loss" if val_dataloader is not None else "train_loss",
+                 patience=training_config.patience_early_stopping,
                  mode="min",
              ),
              LearningRateMonitor(logging_interval="step"),
          ]
-
-         train_params = {
+
+         trainer_params = {
+             "accelerator": training_config.accelerator,
              "callbacks": callbacks,
-             "max_epochs": num_epochs,
+             "max_epochs": training_config.num_epochs,
              "num_sanity_val_steps": 2,
              "strategy": "auto",
              "log_every_n_steps": 1,
              "enable_progress_bar": True,
          }
-
-         if trainer_params is not None:
-             train_params.update(trainer_params)
-
-         trainer = pl.Trainer(**train_params)
-
+
+         if training_config.trainer_params is not None:
+             trainer_params.update(training_config.trainer_params)
+
+         trainer = pl.Trainer(**trainer_params)
+
          torch.cuda.empty_cache()
          torch.set_float32_matmul_precision("medium")
-
+
          if verbose:
              logger.info("Launching training...")
              start = time.time()
-
-         trainer.fit(self.classifier.lightning_module, train_dataloader, val_dataloader)
-
+
+         trainer.fit(self.lightning_module, train_dataloader, val_dataloader)
+
          if verbose:
              end = time.time()
              logger.info(f"Training completed in {end - start:.2f} seconds.")
-
-         # Load best model using wrapper method
+
          best_model_path = trainer.checkpoint_callback.best_model_path
-         self.classifier.load_best_model(best_model_path)
-
-     def predict(self, X: np.ndarray, **kwargs) -> np.ndarray:
-         """Make predictions on input data.
-
-         Args:
-             X: Input data for prediction (text and optional categorical features)
-             **kwargs: Additional arguments passed to the underlying predictor
-
-         Returns:
-             np.ndarray: Predicted class labels
-
-         Example:
-             >>> X_test = np.array(["new text sample", "another sample"])
-             >>> predictions = classifier.predict(X_test)
-             >>> print(predictions) # [0, 1]
-         """
-         return self.classifier.predict(X, **kwargs)
-
-     def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-         """Validate the model on test data.
-
-         Args:
-             X: Input data for validation
-             Y: True labels for validation
-             **kwargs: Additional arguments passed to the validator
-
-         Returns:
-             float: Validation accuracy score
-
-         Example:
-             >>> accuracy = classifier.validate(X_test, y_test)
-             >>> print(f"Accuracy: {accuracy:.3f}")
-         """
-         return self.classifier.validate(X, Y, **kwargs)
-
-     def predict_and_explain(self, X: np.ndarray, **kwargs):
-         """Make predictions with explanations (if supported).
-
-         This method provides both predictions and explanations for the model's
-         decisions. Availability depends on the specific classifier implementation.
-
+
+         self.lightning_module = TextClassificationModule.load_from_checkpoint(
+             best_model_path,
+             model=self.pytorch_model,
+             loss=training_config.loss,
+         )
+
+         self.pytorch_model = self.lightning_module.model.to(self.device)
+
+         self.lightning_module.eval()
+
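
Note: a hedged end-to-end sketch of calling train() with the (N, d) input convention enforced by the _check_* helpers below: first column raw text, remaining columns integer-coded categorical variables. The example values and the top-level TrainingConfig import are illustrative assumptions; ttc stands for an already-constructed torchTextClassifiers instance.

import numpy as np

from torchTextClassifiers import TrainingConfig

# First column: raw text. Remaining columns: integer-coded categorical variables,
# each value strictly below the corresponding categorical_vocabulary_sizes entry.
X = np.array(
    [
        ["cheap cotton t-shirt", 0, 3],
        ["stainless steel water bottle", 1, 7],
        ["wireless noise-cancelling headphones", 2, 1],
    ],
    dtype=object,
)
Y = np.array([0, 1, 1])  # integer labels in [0, num_classes - 1]

training_config = TrainingConfig(num_epochs=3, batch_size=2, lr=1e-3)

ttc.train(
    X_train=X,
    y_train=Y,
    X_val=X,  # optional in 1.0.0; without it, checkpointing and early stopping monitor train_loss
    y_val=Y,
    training_config=training_config,
)
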
+     def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+         X = self._check_X(X)
+         Y = self._check_Y(Y)
+
+         if X["text"].shape[0] != len(Y):
+             raise ValueError("X_train and y_train must have the same number of observations.")
+
+         return X, Y
+
+     @staticmethod
+     def _check_text_col(X):
+         assert isinstance(
+             X, np.ndarray
+         ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables."
+
+         try:
+             if X.ndim > 1:
+                 text = X[:, 0].astype(str)
+             else:
+                 text = X[:].astype(str)
+         except ValueError:
+             logger.error("The first column of X must be castable in string format.")
+
+         return text
+
+     def _check_categorical_variables(self, X: np.ndarray) -> None:
+         """Check if categorical variables in X match training configuration.
+
          Args:
-             X: Input data for prediction
-             **kwargs: Additional arguments passed to the explainer
-
-         Returns:
-             tuple: (predictions, explanations) where explanations format depends
-                 on the classifier type
-
+             X: Input data to check
+
          Raises:
-             NotImplementedError: If the classifier doesn't support explanations
-
-         Example:
-             >>> predictions, explanations = classifier.predict_and_explain(X_test)
-             >>> print(f"Predictions: {predictions}")
-             >>> print(f"Explanations: {explanations}")
+             ValueError: If the number of categorical variables does not match
+                 the training configuration
          """
-         if hasattr(self.classifier, 'predict_and_explain'):
-             return self.classifier.predict_and_explain(X, **kwargs)
+
+         assert self.categorical_var_net is not None
+
+         if X.ndim > 1:
+             num_cat_vars = X.shape[1] - 1
          else:
-             raise NotImplementedError(f"Explanation not supported for {type(self.classifier).__name__}")
-
-     def to_json(self, filepath: str) -> None:
-         """Save classifier configuration to JSON file.
-
-         This method serializes the classifier configuration to a JSON
-         file. Note: This only saves configuration, not trained model weights.
-         Custom classifier wrappers should implement a class method `get_wrapper_class_info()`
-         that returns a dict with 'module' and 'class_name' keys for proper reconstruction.
-
-         Args:
-             filepath: Path where to save the JSON configuration file
-
-         Example:
-             >>> classifier.to_json('my_classifier_config.json')
+             num_cat_vars = 0
+
+         if num_cat_vars != self.categorical_var_net.num_categorical_features:
+             raise ValueError(
+                 f"X must have the same number of categorical variables as the number of embedding layers in the categorical net: ({self.categorical_var_net.num_categorical_features})."
+             )
+
+         try:
+             categorical_variables = X[:, 1:].astype(int)
+         except ValueError:
+             logger.error(
+                 f"Columns {1} to {X.shape[1] - 1} of X_train must be castable in integer format."
+             )
+
+         for j in range(X.shape[1] - 1):
+             max_cat_value = categorical_variables[:, j].max()
+             if max_cat_value >= self.categorical_var_net.categorical_vocabulary_sizes[j]:
+                 raise ValueError(
+                     f"Categorical variable at index {j} has value {max_cat_value} which exceeds the vocabulary size of {self.categorical_var_net.categorical_vocabulary_sizes[j]}."
+                 )
+
+         return categorical_variables
+
+     def _check_X(self, X: np.ndarray) -> np.ndarray:
+         text = self._check_text_col(X)
+
+         categorical_variables = None
+         if self.categorical_var_net is not None:
+             categorical_variables = self._check_categorical_variables(X)
+
+         return {"text": text, "categorical_variables": categorical_variables}
+
+     def _check_Y(self, Y):
+         if self.ragged_multilabel:
+             assert isinstance(
+                 Y, list
+             ), "Y must be a list of lists for ragged multilabel classification."
+             for row in Y:
+                 assert isinstance(row, list), "Each element of Y must be a list of labels."
+
+             return Y
+
+         else:
+             assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)."
+             assert (
+                 len(Y.shape) == 1 or len(Y.shape) == 2
+             ), "Y must be a numpy array of shape (N,) or (N, num_labels)."
+
+             try:
+                 Y = Y.astype(int)
+             except ValueError:
+                 logger.error("Y must be castable in integer format.")
+
+             if Y.max() >= self.num_classes or Y.min() < 0:
+                 raise ValueError(
+                     f"Y contains class labels outside the range [0, {self.num_classes - 1}]."
+                 )
+
+             return Y
+
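
Note: _check_Y above accepts two label formats depending on ragged_multilabel; a short sketch of both (shapes are illustrative):

import numpy as np

# ragged_multilabel=False (default): a numpy array of integer class labels in [0, num_classes - 1]
y_single = np.array([0, 4, 2])

# ragged_multilabel=True: a plain Python list of lists, one variable-length label list per sample
y_ragged = [[0, 3], [1], [2, 5, 7]]
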
+     def predict(
+         self,
+         X_test: np.ndarray,
+         top_k=1,
+         explain=False,
+     ):
          """
-         with open(filepath, "w") as f:
-             data = {
-                 "config": self.config.to_dict(),
-             }
-
-             # Try to get wrapper class info for reconstruction
-             if hasattr(self.classifier.__class__, 'get_wrapper_class_info'):
-                 data["wrapper_class_info"] = self.classifier.__class__.get_wrapper_class_info()
-             else:
-                 # Fallback: store module and class name
-                 data["wrapper_class_info"] = {
-                     "module": self.classifier.__class__.__module__,
-                     "class_name": self.classifier.__class__.__name__
-                 }
-
-             json.dump(data, f, cls=NumpyJSONEncoder, indent=4)
-
-     @classmethod
-     def from_json(cls, filepath: str, wrapper_class: Optional[Type[BaseClassifierWrapper]] = None) -> "torchTextClassifiers":
-         """Load classifier configuration from JSON file.
-
-         This method creates a new classifier instance from a previously saved
-         configuration file. The classifier will need to be built and trained again.
-
          Args:
-             filepath: Path to the JSON configuration file
-             wrapper_class: Optional wrapper class to use. If not provided, will try to
-                 reconstruct from saved wrapper_class_info
-
-         Returns:
-             torchTextClassifiers: New classifier instance with loaded configuration
-
-         Raises:
-             ImportError: If the wrapper class cannot be imported
-             FileNotFoundError: If the configuration file doesn't exist
-
-         Example:
-             >>> # Using saved wrapper class info
-             >>> classifier = torchTextClassifiers.from_json('my_classifier_config.json')
-             >>>
-             >>> # Or providing wrapper class explicitly
-             >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper
-             >>> classifier = torchTextClassifiers.from_json('config.json', FastTextWrapper)
+             X_test (np.ndarray): input data to predict on, shape (N,d) where the first column is text and the rest are categorical variables
+             top_k (int): for each sentence, return the top_k most likely predictions (default: 1)
+             explain (bool): launch gradient integration to have an explanation of the prediction (default: False)
+
+         Returns: A dictionary containing the following fields:
+             - predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query.
+             - confidence (torch.Tensor, shape (len(text), top_k)): A tensor array containing the corresponding confidence scores.
+             - if explain is True:
+                 - attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text.
          """
-         with open(filepath, "r") as f:
-             data = json.load(f)
-
-         if wrapper_class is None:
-             # Try to reconstruct wrapper class from saved info
-             if "wrapper_class_info" not in data:
-                 raise ValueError("No wrapper_class_info found in config file and no wrapper_class provided")
-
-             wrapper_info = data["wrapper_class_info"]
-             module_name = wrapper_info["module"]
-             class_name = wrapper_info["class_name"]
-
-             # Dynamically import the wrapper class
-             import importlib
-             module = importlib.import_module(module_name)
-             wrapper_class = getattr(module, class_name)
-
-         # Reconstruct config using wrapper class's config class
-         config_class = wrapper_class.get_config_class()
-         config = config_class.from_dict(data["config"])
-
-         # Create wrapper instance
-         wrapper = wrapper_class(config)
-
-         return cls(wrapper)
+
+         if explain:
+             return_offsets_mapping = True # to be passed to the tokenizer
+             return_word_ids = True
+             if self.pytorch_model.text_embedder is None:
+                 raise RuntimeError(
+                     "Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs."
+                 )
+             else:
+                 if not HAS_CAPTUM:
+                     raise ImportError(
+                         "Captum is not installed and is required for explainability. Run 'pip install/uv add torchFastText[explainability]'."
+                     )
+                 lig = LayerIntegratedGradients(
+                     self.pytorch_model, self.pytorch_model.text_embedder.embedding_layer
+                 ) # initialize a Captum layer gradient integrator
+         else:
+             return_offsets_mapping = False
+             return_word_ids = False
+
+         X_test = self._check_X(X_test)
+         text = X_test["text"]
+         categorical_variables = X_test["categorical_variables"]
+
+         self.pytorch_model.eval().cpu()
+
+         tokenize_output = self.tokenizer.tokenize(
+             text.tolist(),
+             return_offsets_mapping=return_offsets_mapping,
+             return_word_ids=return_word_ids,
+         )
+
+         if not isinstance(tokenize_output, TokenizerOutput):
+             raise TypeError(
+                 f"Expected TokenizerOutput, got {type(tokenize_output)} from tokenizer.tokenize method."
+             )
+
+         encoded_text = tokenize_output.input_ids # (batch_size, seq_len)
+         attention_mask = tokenize_output.attention_mask # (batch_size, seq_len)
+
+         if categorical_variables is not None:
+             categorical_vars = torch.tensor(
+                 categorical_variables, dtype=torch.float32
+             ) # (batch_size, num_categorical_features)
+         else:
+             categorical_vars = torch.empty((encoded_text.shape[0], 0), dtype=torch.float32)
+
+         pred = self.pytorch_model(
+             encoded_text, attention_mask, categorical_vars
+         ) # forward pass, contains the prediction scores (len(text), num_classes)
+
+         label_scores = pred.detach().cpu().softmax(dim=1) # convert to probabilities
+
+         label_scores_topk = torch.topk(label_scores, k=top_k, dim=1)
+
+         predictions = label_scores_topk.indices # get the top_k most likely predictions
+         confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores
+
+         if explain:
+             all_attributions = []
+             for k in range(top_k):
+                 attributions = lig.attribute(
+                     (encoded_text, attention_mask, categorical_vars),
+                     target=torch.Tensor(predictions[:, k]).long(),
+                 ) # (batch_size, seq_len)
+                 attributions = attributions.sum(dim=-1)
+                 all_attributions.append(attributions.detach().cpu())
+
+             all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len)
+
+             return {
+                 "prediction": predictions,
+                 "confidence": confidence,
+                 "attributions": all_attributions,
+                 "offset_mapping": tokenize_output.offset_mapping,
+                 "word_ids": tokenize_output.word_ids,
+             }
+         else:
+             return {
+                 "prediction": predictions,
+                 "confidence": confidence,
+             }
+
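
Note: a usage sketch for predict() above, reusing the ttc and X placeholders from the training example. Two details worth flagging: the returned dictionary uses the key "prediction" (singular), although the docstring lists "predictions", and explain=True requires captum plus a tokenizer that returns token IDs.

out = ttc.predict(X, top_k=2)
print(out["prediction"].shape)  # (n_samples, 2): top-2 class indices per row
print(out["confidence"])        # matching softmax scores, rounded to 2 decimals

out_explained = ttc.predict(X, top_k=1, explain=True)
print(out_explained["attributions"].shape)  # (n_samples, 1, seq_len): per-token attributions
print(sorted(out_explained.keys()))         # also includes "offset_mapping" and "word_ids"
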
+     def __repr__(self):
+         model_type = (
+             self.lightning_module.__repr__()
+             if hasattr(self, "lightning_module")
+             else self.pytorch_model.__repr__()
+         )
+
+         tokenizer_info = self.tokenizer.__repr__()
+
+         cat_forward_type = (
+             self.categorical_var_net.forward_type.name
+             if self.categorical_var_net is not None
+             else "None"
+         )
+
+         lines = [
+             "torchTextClassifiers(",
+             f" tokenizer = {tokenizer_info},",
+             f" model = {model_type},",
+             f" categorical_forward_type = {cat_forward_type},",
+             f" num_classes = {self.model_config.num_classes},",
+             f" embedding_dim = {self.embedding_dim},",
+             ")",
+         ]
+         return "\n".join(lines)