torchtextclassifiers 0.0.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
Files changed (34)
  1. torchTextClassifiers/__init__.py +12 -48
  2. torchTextClassifiers/dataset/__init__.py +1 -0
  3. torchTextClassifiers/dataset/dataset.py +152 -0
  4. torchTextClassifiers/model/__init__.py +2 -0
  5. torchTextClassifiers/model/components/__init__.py +12 -0
  6. torchTextClassifiers/model/components/attention.py +126 -0
  7. torchTextClassifiers/model/components/categorical_var_net.py +128 -0
  8. torchTextClassifiers/model/components/classification_head.py +61 -0
  9. torchTextClassifiers/model/components/text_embedder.py +220 -0
  10. torchTextClassifiers/model/lightning.py +170 -0
  11. torchTextClassifiers/model/model.py +151 -0
  12. torchTextClassifiers/tokenizers/WordPiece.py +92 -0
  13. torchTextClassifiers/tokenizers/__init__.py +10 -0
  14. torchTextClassifiers/tokenizers/base.py +205 -0
  15. torchTextClassifiers/tokenizers/ngram.py +472 -0
  16. torchTextClassifiers/torchTextClassifiers.py +500 -413
  17. torchTextClassifiers/utilities/__init__.py +0 -3
  18. torchTextClassifiers/utilities/plot_explainability.py +184 -0
  19. torchtextclassifiers-1.0.0.dist-info/METADATA +87 -0
  20. torchtextclassifiers-1.0.0.dist-info/RECORD +21 -0
  21. {torchtextclassifiers-0.0.1.dist-info → torchtextclassifiers-1.0.0.dist-info}/WHEEL +1 -1
  22. torchTextClassifiers/classifiers/base.py +0 -83
  23. torchTextClassifiers/classifiers/fasttext/__init__.py +0 -25
  24. torchTextClassifiers/classifiers/fasttext/core.py +0 -269
  25. torchTextClassifiers/classifiers/fasttext/model.py +0 -752
  26. torchTextClassifiers/classifiers/fasttext/tokenizer.py +0 -346
  27. torchTextClassifiers/classifiers/fasttext/wrapper.py +0 -216
  28. torchTextClassifiers/classifiers/simple_text_classifier.py +0 -191
  29. torchTextClassifiers/factories.py +0 -34
  30. torchTextClassifiers/utilities/checkers.py +0 -108
  31. torchTextClassifiers/utilities/preprocess.py +0 -82
  32. torchTextClassifiers/utilities/utils.py +0 -346
  33. torchtextclassifiers-0.0.1.dist-info/METADATA +0 -187
  34. torchtextclassifiers-0.0.1.dist-info/RECORD +0 -17
torchTextClassifiers/classifiers/fasttext/tokenizer.py
@@ -1,346 +0,0 @@
- """
- NGramTokenizer class.
- """
-
- import ctypes
- import json
- from typing import List, Tuple, Type, Dict
-
- import numpy as np
- import torch
- from torch import Tensor
- from concurrent.futures import ThreadPoolExecutor
- from dataclasses import dataclass
- from queue import Queue
- import multiprocessing
-
- from ...utilities.preprocess import clean_text_feature
-
-
- class NGramTokenizer:
-     """
-     NGramTokenizer class.
-     """
-
-     def __init__(
-         self,
-         min_count: int,
-         min_n: int,
-         max_n: int,
-         num_tokens: int,
-         len_word_ngrams: int,
-         training_text: List[str],
-         **kwargs,
-     ):
-         """
-         Constructor for the NGramTokenizer class.
-
-         Args:
-             min_count (int): Minimum number of times a word has to be
-                 in the training data to be given an embedding.
-             min_n (int): Minimum length of character n-grams.
-             max_n (int): Maximum length of character n-grams.
-             num_tokens (int): Number of rows in the embedding matrix.
-             word_ngrams (int): Maximum length of word n-grams.
-             training_text (List[str]): List of training texts.
-
-         Raises:
-             ValueError: If `min_n` is 1 or smaller.
-             ValueError: If `max_n` is 7 or higher.
-         """
-         if min_n < 2:
-             raise ValueError("`min_n` parameter must be greater than 1.")
-         if max_n > 6:
-             raise ValueError("`max_n` parameter must be smaller than 7.")
-
-         self.min_count = min_count
-         self.min_n = min_n
-         self.max_n = max_n
-         self.num_tokens = num_tokens
-         self.word_ngrams = len_word_ngrams
-
-         word_counts = {}
-         for sentence in training_text:
-             for word in sentence.split(" "):
-                 word_counts[word] = word_counts.setdefault(word, 0) + 1
-
-         self.word_id_mapping = {}
-         i = 1
-         for word, counts in word_counts.items():
-             if word_counts[word] >= min_count:
-                 self.word_id_mapping[word] = i
-                 i += 1
-         self.nwords = len(self.word_id_mapping)
-
-         self.padding_index = self.num_tokens + self.get_nwords()
-
-     def __str__(self) -> str:
-         """
-         Returns description of the NGramTokenizer.
-
-         Returns:
-             str: Description.
-         """
-         return f"<NGramTokenizer(min_n={self.min_n}, max_n={self.max_n}, num_tokens={self.num_tokens}, word_ngrams={self.word_ngrams}, nwords={self.nwords})>"
-
-     def get_nwords(self) -> int:
-         """
-         Return number of words kept in training data.
-
-         Returns:
-             int: Number of words.
-         """
-         return self.nwords
-
-     def get_buckets(self) -> int:
-         """
-         Return number of buckets for tokenizer.
-
-         Returns:
-             int: Number of buckets.
-         """
-         return self.num_tokens
-
-     @staticmethod
-     def get_ngram_list(word: str, n: int) -> List[str]:
-         """
-         Return the list of character n-grams for a word with a
-         given n.
-
-         Args:
-             word (str): Word.
-             n (int): Length of the n-grams.
-
-         Returns:
-             List[str]: List of character n-grams.
-         """
-         return [word[i : i + n] for i in range(len(word) - n + 1)]
-
-     @staticmethod
-     def get_hash(subword: str) -> int:
-         """
-         Return hash for a given subword.
-
-         Args:
-             subword (str): Character n-gram.
-
-         Returns:
-             int: Corresponding hash.
-         """
-         h = ctypes.c_uint32(2166136261).value
-         for c in subword:
-             c = ctypes.c_int8(ord(c)).value
-             h = ctypes.c_uint32(h ^ c).value
-             h = ctypes.c_uint32(h * 16777619).value
-         return h
-
-     @staticmethod
-     def get_word_ngram_id(hashes: Tuple[int], bucket: int, nwords: int) -> int:
-         """
-         Get word ngram index in the embedding matrix.
-
-         Args:
-             hashes (Tuple[int]): Word hashes.
-             bucket (int): Number of rows in embedding matrix.
-             nwords (int): Number of words in the vocabulary.
-
-         Returns:
-             int: Word ngram hash.
-         """
-         hashes = [ctypes.c_int32(hash_value).value for hash_value in hashes]
-         h = ctypes.c_uint64(hashes[0]).value
-         for j in range(1, len(hashes)):
-             h = ctypes.c_uint64((h * 116049371)).value
-             h = ctypes.c_uint64(h + hashes[j]).value
-         return h % bucket + nwords
-
-     def get_subword_index(self, subword: str) -> int:
-         """
-         Return the row index from the embedding matrix which
-         corresponds to a character n-gram.
-
-         Args:
-             subword (str): Character n-gram.
-
-         Returns:
-             int: Index.
-         """
-         return self.get_hash(subword) % self.num_tokens + self.nwords
-
-     def get_word_index(self, word: str) -> int:
-         """
-         Return the row index from the embedding matrix which
-         corresponds to a word.
-
-         Args:
-             word (str): Word.
-
-         Returns:
-             int: Index.
-         """
-         return self.word_id_mapping[word]
-
-     def get_subwords(self, word: str) -> Tuple[List[str], List[int]]:
-         """
-         Return all subwords tokens and indices for a given word.
-         Also adds the whole word token and indice if the word is in word_id_mapping
-         (==> the word is in initial vocabulary + seen at least MIN_COUNT times).
-         Adds tags "<" and ">" to the word.
-
-         Args:
-             word (str): Word.
-
-         Returns:
-             Tuple[List[str], List[int]]: Tuple of tokens and indices.
-         """
-         tokens = []
-         word_with_tags = "<" + word + ">"
-
-         # Get subwords and associated indices WITHOUT the whole word
-         for n in range(self.min_n, self.max_n + 1):
-             ngrams = self.get_ngram_list(word_with_tags, n)
-             tokens += [
-                 ngram for ngram in ngrams if ngram != word_with_tags and ngram != word
-             ]  # Exclude the full word
-
-         indices = [self.get_subword_index(token) for token in tokens]
-         assert word not in tokens
-
-         # Add word token and indice only if the word is in word_id_mapping
-         if word in self.word_id_mapping.keys():
-             self.get_word_index(word)
-             tokens = [word] + tokens
-             indices = [self.get_word_index(word)] + indices
-
-         return (tokens, indices)
-
-     def indices_matrix(self, sentence: str) -> tuple[torch.Tensor, dict, dict]:
-         """
-         Returns an array of token indices for a text description.
-
-         Args:
-             sentence (str): Text description.
-
-         Returns:
-             tuple: (torch.Tensor of indices, id_to_token dict, token_to_id dict)
-         """
-         # Pre-split the sentence once
-         words = sentence.split()
-         words.append("</s>")  # Add end of string token
-
-         indices = []
-         all_tokens_id = {}
-
-         # Process subwords in one batch
-         for word in words[:-1]:  # Exclude </s> from subword processing
-             tokens, ind = self.get_subwords(word)
-             indices.extend(ind)
-             # Update dictionary with zip for efficiency
-             all_tokens_id.update(zip(tokens, ind))
-
-         # Add </s> token
-         indices.append(0)
-         all_tokens_id["</s>"] = 0
-
-         # Compute word n-grams more efficiently
-         if self.word_ngrams > 1:
-             # Pre-compute hashes for all words to avoid repeated computation
-             word_hashes = [self.get_hash(word) for word in words]
-
-             # Generate n-grams using sliding window
-             word_ngram_ids = []
-             for n in range(2, self.word_ngrams + 1):
-                 for i in range(len(words) - n + 1):
-                     # Get slice of hashes for current n-gram
-                     gram_hashes = tuple(word_hashes[i : i + n])
-
-                     # Compute n-gram ID
-                     word_ngram_id = int(
-                         self.get_word_ngram_id(gram_hashes, self.num_tokens, self.nwords)
-                     )
-
-                     # Store gram and its ID
-                     gram = " ".join(words[i : i + n])
-                     all_tokens_id[gram] = word_ngram_id
-                     word_ngram_ids.append(word_ngram_id)
-
-             # Extend indices with n-gram IDs
-             indices.extend(word_ngram_ids)
-
-         # Create reverse mapping once at the end
-         id_to_token = {v: k for k, v in all_tokens_id.items()}
-
-         # Convert to tensor directly
-         return torch.tensor(indices, dtype=torch.long), id_to_token, all_tokens_id
-
-     def tokenize(self, text: list[str], text_tokens=True, preprocess=False):
-         """
-         Tokenize a list of sentences.
-
-         Args:
-             text (list[str]): List of sentences.
-             text_tokens (bool): If True, return tokenized text in tokens.
-             preprocess (bool): If True, preprocess text. Needs unidecode library.
-
-         Returns:
-             np.array: Array of indices.
-         """
-
-         if preprocess:
-             text = clean_text_feature(text)
-
-         tokenized_text = []
-         id_to_token_dicts = []
-         token_to_id_dicts = []
-         for sentence in text:
-             all_ind, id_to_token, token_to_id = self.indices_matrix(
-                 sentence
-             )  # tokenize and convert to token indices
-             tokenized_text.append(all_ind)
-             id_to_token_dicts.append(id_to_token)
-             token_to_id_dicts.append(token_to_id)
-
-         if text_tokens:
-             tokenized_text_tokens = self._tokenized_text_in_tokens(
-                 tokenized_text, id_to_token_dicts
-             )
-             return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
-         else:
-             return tokenized_text, id_to_token_dicts, token_to_id_dicts
-
-     def _tokenized_text_in_tokens(self, tokenized_text, id_to_token_dicts):
-         """
-         Convert tokenized text in int format to tokens in str format (given a mapping dictionary).
-         Private method. Used in tokenizer.tokenize and pytorch_model.predict()
-
-         Args:
-             tokenized_text (list): List of tokenized text in int format.
-             id_to_token_dicts (list[Dict]): List of dictionaries mapping token indices to tokens.
-
-             Both lists have the same length (number of sentences).
-
-         Returns:
-             list[list[str]]: List of tokenized text in str format.
-
-         """
-
-         return [
-             [
-                 id_to_token_dicts[i][token_id.item()]
-                 for token_id in tokenized_sentence
-                 if token_id.item() not in {self.padding_index}
-             ]
-             for i, tokenized_sentence in enumerate(tokenized_text)
-         ]
-
-     def get_vocab(self):
-         return self.word_id_mapping
-
-     @classmethod
-     def from_json(cls: Type["NGramTokenizer"], filepath: str, training_text) -> "NGramTokenizer":
-         """
-         Load a dataclass instance from a JSON file.
-         """
-         with open(filepath, "r") as f:
-             data = json.load(f)
-         return cls(**data, training_text=training_text)
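Note: the removed NGramTokenizer implements fastText-style hashing: character n-grams and word n-grams are mapped to rows of the embedding matrix with an FNV-1a hash taken modulo `num_tokens` and offset by the vocabulary size. Below is a minimal, illustrative sketch of how it could be driven under the 0.0.1 layout; the import path follows the file list above and the argument values are arbitrary examples, not package defaults.

```python
# Illustrative only: import path assumes the 0.0.1 package layout shown above.
from torchTextClassifiers.classifiers.fasttext.tokenizer import NGramTokenizer

training_text = ["the cat sat on the mat", "the dog slept on the rug"]

tokenizer = NGramTokenizer(
    min_count=1,        # keep words seen at least once
    min_n=3,            # shortest character n-gram (constructor requires >= 2)
    max_n=5,            # longest character n-gram (constructor requires <= 6)
    num_tokens=10_000,  # hash buckets, i.e. extra rows in the embedding matrix
    len_word_ngrams=2,  # also hash word bigrams
    training_text=training_text,
)

# With text_tokens=True (the default), tokenize() returns string tokens,
# index tensors, and the per-sentence id<->token mappings.
tokens, indices, id_to_token, token_to_id = tokenizer.tokenize(["the cat sat"])
print(tokens[0][:5], indices[0][:5])
```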
torchTextClassifiers/classifiers/fasttext/wrapper.py
@@ -1,216 +0,0 @@
- from typing import Optional
- from ..base import BaseClassifierWrapper
- from .core import FastTextConfig
- from .tokenizer import NGramTokenizer
- from .model import FastTextModel, FastTextModule, FastTextModelDataset
- from ...utilities.checkers import check_X, check_Y
- import logging
- import numpy as np
- import torch
- from torch.optim import SGD, Adam
-
- logger = logging.getLogger()
-
-
- class FastTextWrapper(BaseClassifierWrapper):
-     """Wrapper for FastText classifier."""
-
-     def __init__(self, config: FastTextConfig):
-         super().__init__(config)
-         self.config: FastTextConfig = config
-         self.tokenizer: Optional[NGramTokenizer] = None  # FastText-specific tokenizer
-
-     def prepare_text_features(self, training_text: np.ndarray) -> None:
-         """Build NGram tokenizer for FastText."""
-         self.tokenizer = NGramTokenizer(
-             self.config.min_count,
-             self.config.min_n,
-             self.config.max_n,
-             self.config.num_tokens,
-             self.config.len_word_ngrams,
-             training_text,
-         )
-
-     def build_tokenizer(self, training_text: np.ndarray) -> None:
-         """Legacy method for backward compatibility."""
-         self.prepare_text_features(training_text)
-
-     def _build_pytorch_model(self) -> None:
-         """Build FastText PyTorch model."""
-         if self.config.num_rows is None:
-             if self.tokenizer is None:
-                 raise ValueError(
-                     "Please provide a tokenizer or num_rows."
-                 )
-             else:
-                 self.config.num_rows = self.tokenizer.padding_index + 1
-         else:
-             if self.tokenizer is not None:
-                 if self.config.num_rows != self.tokenizer.padding_index + 1:
-                     logger.warning(
-                         f"Divergent values for num_rows: {self.config.num_rows} and {self.tokenizer.padding_index + 1}. "
-                         f"Using max value."
-                     )
-                     self.config.num_rows = max(self.config.num_rows, self.tokenizer.padding_index + 1)
-
-         self.padding_idx = self.config.num_rows - 1
-
-         # Update tokenizer padding index if necessary
-         if self.tokenizer is not None and self.padding_idx != self.tokenizer.padding_index:
-             self.tokenizer.padding_index = self.padding_idx
-
-         self.pytorch_model = FastTextModel(
-             tokenizer=self.tokenizer,
-             embedding_dim=self.config.embedding_dim,
-             num_rows=self.config.num_rows,
-             num_classes=self.config.num_classes,
-             categorical_vocabulary_sizes=self.config.categorical_vocabulary_sizes,
-             categorical_embedding_dims=self.config.categorical_embedding_dims,
-             padding_idx=self.padding_idx,
-             sparse=self.config.sparse,
-             direct_bagging=self.config.direct_bagging,
-         )
-
-     def _check_and_init_lightning(
-         self,
-         optimizer=None,
-         optimizer_params=None,
-         lr=None,
-         scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
-         scheduler_params=None,
-         patience_scheduler=3,
-         loss=torch.nn.CrossEntropyLoss(),
-     ) -> None:
-         """Initialize Lightning module for FastText."""
-         if optimizer is None:
-             if lr is None:
-                 lr = getattr(self.config, 'learning_rate', 4e-3)  # Use config or default
-             self.optimizer = SGD if self.config.sparse else Adam
-             self.optimizer_params = {"lr": lr}
-         else:
-             self.optimizer = optimizer
-             if optimizer_params is None:
-                 if lr is not None:
-                     self.optimizer_params = {"lr": lr}
-                 else:
-                     logger.warning("No optimizer parameters provided. Using defaults.")
-                     self.optimizer_params = {}
-
-         self.scheduler = scheduler
-
-         if scheduler_params is None:
-             logger.warning("No scheduler parameters provided. Using defaults.")
-             self.scheduler_params = {
-                 "mode": "min",
-                 "patience": patience_scheduler,
-             }
-         else:
-             self.scheduler_params = scheduler_params
-
-         self.loss = loss
-
-         self.lightning_module = FastTextModule(
-             model=self.pytorch_model,
-             loss=self.loss,
-             optimizer=self.optimizer,
-             optimizer_params=self.optimizer_params,
-             scheduler=self.scheduler,
-             scheduler_params=self.scheduler_params,
-             scheduler_interval="epoch",
-         )
-
-     def predict(self, X: np.ndarray, top_k=1, preprocess=False, verbose=False) -> np.ndarray:
-         """Make predictions with FastText model."""
-         if not self.trained:
-             raise Exception("Model must be trained first.")
-
-         text, categorical_variables, no_cat_var = check_X(X)
-         if categorical_variables is not None:
-             if categorical_variables.shape[1] != self.config.num_categorical_features:
-                 raise Exception(
-                     f"X must have the same number of categorical variables as training data."
-                 )
-         else:
-             assert self.pytorch_model.no_cat_var == True
-
-         predictions, confidence = self.pytorch_model.predict(
-             text, categorical_variables, top_k=top_k, preprocess=preprocess
-         )
-
-         # Return just predictions, squeeze out the top_k dimension if top_k=1
-         if top_k == 1:
-             predictions = predictions.squeeze(-1)
-
-         # Convert to numpy array for consistency
-         if hasattr(predictions, 'numpy'):
-             predictions = predictions.numpy()
-
-         return predictions
-
-     def validate(self, X: np.ndarray, Y: np.ndarray, batch_size=256, num_workers=12) -> float:
-         """Validate FastText model."""
-         if not self.trained:
-             raise Exception("Model must be trained first.")
-
-         # Use predict method which handles input validation and returns just predictions
-         predictions = self.predict(X)
-         y = check_Y(Y)
-
-         # Convert predictions to numpy if it's a tensor
-         if hasattr(predictions, 'numpy'):
-             predictions = predictions.numpy()
-
-         # Calculate accuracy
-         accuracy = (predictions == y).mean()
-         return float(accuracy)
-
-     def predict_and_explain(self, X: np.ndarray, top_k=1):
-         """Predict and explain with FastText model."""
-         if not self.trained:
-             raise Exception("Model must be trained first.")
-
-         text, categorical_variables, no_cat_var = check_X(X)
-         if categorical_variables is not None:
-             if categorical_variables.shape[1] != self.config.num_categorical_features:
-                 raise Exception(
-                     f"X must have the same number of categorical variables as training data ({self.config.num_categorical_features})."
-                 )
-         else:
-             assert self.pytorch_model.no_cat_var == True
-
-         return self.pytorch_model.predict_and_explain(text, categorical_variables, top_k=top_k)
-
-     def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: np.ndarray = None):
-         """Create FastText dataset."""
-         return FastTextModelDataset(
-             categorical_variables=categorical_variables,
-             texts=texts,
-             outputs=labels,
-             tokenizer=self.tokenizer,
-         )
-
-     def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True):
-         """Create FastText dataloader."""
-         return dataset.create_dataloader(batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)
-
-     def load_best_model(self, checkpoint_path: str) -> None:
-         """Load best FastText model from checkpoint."""
-         self.lightning_module = FastTextModule.load_from_checkpoint(
-             checkpoint_path,
-             model=self.pytorch_model,
-             loss=self.loss,
-             optimizer=self.optimizer,
-             optimizer_params=self.optimizer_params,
-             scheduler=self.scheduler,
-             scheduler_params=self.scheduler_params,
-             scheduler_interval="epoch",
-         )
-         self.pytorch_model = self.lightning_module.model.to("cpu")
-         self.trained = True
-         self.pytorch_model.eval()
-
-     @classmethod
-     def get_config_class(cls):
-         """Return the configuration class for FastText wrapper."""
-         return FastTextConfig
-
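Note: FastTextWrapper is normally driven by the orchestrating torchTextClassifiers class rather than called directly. The following is a rough, hypothetical sketch of that lifecycle using only the methods shown in the hunk above; `config` is assumed to be an already-built FastTextConfig, `texts` and `labels` are placeholder numpy arrays, and the checkpoint path is a placeholder.

```python
# Hypothetical driver code for the removed 0.0.1 wrapper (import path per the file list above).
from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper

wrapper = FastTextWrapper(config)

wrapper.prepare_text_features(texts)        # fit the NGramTokenizer on raw training text
wrapper._build_pytorch_model()              # size the embedding matrix from the tokenizer
wrapper._check_and_init_lightning(lr=1e-3)  # wrap the model in a FastTextModule

dataset = wrapper.create_dataset(texts=texts, labels=labels)
loader = wrapper.create_dataloader(dataset, batch_size=256)

# ... fit wrapper.lightning_module on `loader` with a pytorch_lightning.Trainer, then:
wrapper.load_best_model("path/to/best.ckpt")  # placeholder checkpoint path
predictions = wrapper.predict(texts, top_k=1)
accuracy = wrapper.validate(texts, labels)
```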