PyPI - torchtextclassifiers - Versions diffs - 0.0.1__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

torchtextclassifiers 0.0.1-py3-none-any.whl → 1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

torchTextClassifiers/__init__.py +12 -48
torchTextClassifiers/dataset/__init__.py +1 -0
torchTextClassifiers/dataset/dataset.py +152 -0
torchTextClassifiers/model/__init__.py +2 -0
torchTextClassifiers/model/components/__init__.py +12 -0
torchTextClassifiers/model/components/attention.py +126 -0
torchTextClassifiers/model/components/categorical_var_net.py +128 -0
torchTextClassifiers/model/components/classification_head.py +61 -0
torchTextClassifiers/model/components/text_embedder.py +220 -0
torchTextClassifiers/model/lightning.py +170 -0
torchTextClassifiers/model/model.py +151 -0
torchTextClassifiers/tokenizers/WordPiece.py +92 -0
torchTextClassifiers/tokenizers/__init__.py +10 -0
torchTextClassifiers/tokenizers/base.py +205 -0
torchTextClassifiers/tokenizers/ngram.py +472 -0
torchTextClassifiers/torchTextClassifiers.py +500 -413
torchTextClassifiers/utilities/__init__.py +0 -3
torchTextClassifiers/utilities/plot_explainability.py +184 -0
torchtextclassifiers-1.0.0.dist-info/METADATA +87 -0
torchtextclassifiers-1.0.0.dist-info/RECORD +21 -0
{torchtextclassifiers-0.0.1.dist-info → torchtextclassifiers-1.0.0.dist-info}/WHEEL +1 -1
torchTextClassifiers/classifiers/base.py +0 -83
torchTextClassifiers/classifiers/fasttext/__init__.py +0 -25
torchTextClassifiers/classifiers/fasttext/core.py +0 -269
torchTextClassifiers/classifiers/fasttext/model.py +0 -752
torchTextClassifiers/classifiers/fasttext/tokenizer.py +0 -346
torchTextClassifiers/classifiers/fasttext/wrapper.py +0 -216
torchTextClassifiers/classifiers/simple_text_classifier.py +0 -191
torchTextClassifiers/factories.py +0 -34
torchTextClassifiers/utilities/checkers.py +0 -108
torchTextClassifiers/utilities/preprocess.py +0 -82
torchTextClassifiers/utilities/utils.py +0 -346
torchtextclassifiers-0.0.1.dist-info/METADATA +0 -187
torchtextclassifiers-0.0.1.dist-info/RECORD +0 -17

torchTextClassifiers/__init__.py CHANGED Viewed

@@ -11,58 +11,22 @@ Key Features:
 - Extensible architecture for adding new classifier types
 - Support for both text-only and mixed text/categorical features
-Quick Start:
-    >>> from torchTextClassifiers import create_fasttext
-    >>> import numpy as np
-    >>>
-    >>> # Create classifier
-    >>> classifier = create_fasttext(
-    ...     embedding_dim=100,
-    ...     sparse=False,
-    ...     num_tokens=10000,
-    ...     min_count=2,
-    ...     min_n=3,
-    ...     max_n=6,
-    ...     len_word_ngrams=2,
-    ...     num_classes=2
-    ... )
-    >>>
-    >>> # Prepare data
-    >>> X_train = np.array(["positive text", "negative text"])
-    >>> y_train = np.array([1, 0])
-    >>> X_val = np.array(["validation text"])
-    >>> y_val = np.array([1])
-    >>>
-    >>> # Build and train
-    >>> classifier.build(X_train, y_train)
-    >>> classifier.train(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32)
-    >>>
-    >>> # Predict
-    >>> predictions = classifier.predict(np.array(["new text sample"]))
 """
-from .torchTextClassifiers import torchTextClassifiers
-# Convenience imports for FastText
-try:
-    from .classifiers.fasttext.core import FastTextFactory
-    # Expose FastText convenience methods at package level for easy access
-    create_fasttext = FastTextFactory.create_fasttext
-    build_fasttext_from_tokenizer = FastTextFactory.build_from_tokenizer
-except ImportError:
-    # FastText module not available - define placeholder functions
-    def create_fasttext(*args, **kwargs):
-        raise ImportError("FastText module not available")
-    def build_fasttext_from_tokenizer(*args, **kwargs):
-        raise ImportError("FastText module not available")
+from .torchTextClassifiers import (
+    ModelConfig as ModelConfig,
+)
+from .torchTextClassifiers import (
+    TrainingConfig as TrainingConfig,
+)
+from .torchTextClassifiers import (
+    torchTextClassifiers as torchTextClassifiers,
+)
 __all__ = [
     "torchTextClassifiers",
-    "create_fasttext",
-    "build_fasttext_from_tokenizer",
+    "ModelConfig",
+    "TrainingConfig",
 ]
-__version__ = "1.0.0"
+__version__ = "1.0.0"

torchTextClassifiers/dataset/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .dataset import TextClassificationDataset as TextClassificationDataset

torchTextClassifiers/dataset/dataset.py ADDED Viewed

@@ -0,0 +1,152 @@
+import logging
+import os
+from typing import List, Union
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, Dataset
+from torchTextClassifiers.tokenizers import BaseTokenizer
# Turn off the `tokenizers` library's internal parallelism as soon as this
# module is imported.
# NOTE(review): presumably this avoids fork-related warnings/deadlocks when
# tokenization happens inside multi-worker DataLoaders — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class TextClassificationDataset(Dataset):
    """Dataset of raw texts with optional categorical features and labels.

    Items are returned untokenized; tokenization happens per batch in
    :meth:`collate_fn`, which is what :meth:`create_dataloader` plugs into the
    ``DataLoader``.
    """

    def __init__(
        self,
        texts: List[str],
        categorical_variables: Union[List[List[int]], np.ndarray, None],
        tokenizer: "BaseTokenizer",
        labels: Union[List[int], List[List[int]], np.ndarray, None] = None,
        ragged_multilabel: bool = False,
    ):
        """Store the samples and, for ragged multilabel data, infer the class count.

        Args:
            texts: One text per sample.
            categorical_variables: Per-sample categorical values, or ``None``
                when the task is text-only.
            tokenizer: Object exposing ``tokenize(list_of_str)``; if it has a
                ``trained`` attribute, it must be truthy.
            labels: Per-sample labels, or ``None`` for inference. With
                ``ragged_multilabel=True`` each entry is a (possibly empty)
                sequence of class indices.
            ragged_multilabel: When ``True``, labels are variable-length lists
                of class indices that ``collate_fn`` converts to multi-hot rows.

        Raises:
            RuntimeError: If the tokenizer reports ``trained == False``.
            ValueError: If ``ragged_multilabel`` is set but every label list is
                empty, so the number of classes cannot be inferred.
        """
        # Fail fast: collate_fn relies on the tokenizer being usable.
        if hasattr(tokenizer, "trained") and not tokenizer.trained:
            raise RuntimeError(
                f"Tokenizer {type(tokenizer)} must be trained before creating dataset."
            )

        self.texts = texts
        self.categorical_variables = categorical_variables
        self.tokenizer = tokenizer
        self.labels = labels
        self.ragged_multilabel = ragged_multilabel

        if self.ragged_multilabel and self.labels is not None:
            # len() instead of truthiness so NumPy rows do not raise
            # "truth value of an array is ambiguous".
            row_maxima = [max(row) for row in labels if len(row) > 0]
            if not row_maxima:
                raise ValueError(
                    "ragged_multilabel is True but every sample has an empty label "
                    "list; cannot infer the number of classes."
                )
            max_value = int(max(row_maxima))
            # Classes are assumed to be indexed 0..max_value.
            self.num_classes = max_value + 1

            if max_value == 1:
                # A max index of 1 is suspicious: the labels may already be
                # one-hot encoded. Rectangular labels (np.array succeeds) make
                # that even more likely, hence the stronger log level.
                try:
                    np.array(labels)
                except ValueError:
                    logger.warning(
                        "ragged_multilabel set to True but max label value is 1. If your labels are already one-hot encoded, set ragged_multilabel to False. Otherwise computations are likely to be wrong."
                    )
                else:
                    logger.critical(
                        """ragged_multilabel set to True but max label value is 1 and all samples have the same number of labels.
                        If your labels are already one-hot encoded, set ragged_multilabel to False. Otherwise computations are likely to be wrong."""
                    )

    def __len__(self):
        """Return the number of samples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(text, categorical_values_or_None, label_or_None)`` for ``idx``."""
        categorical = (
            self.categorical_variables[idx] if self.categorical_variables is not None else None
        )
        label = self.labels[idx] if self.labels is not None else None
        return str(self.texts[idx]), categorical, label

    def collate_fn(self, batch):
        """Tokenize a batch of items and assemble the model inputs.

        Args:
            batch: Sequence of tuples as produced by :meth:`__getitem__`.

        Returns:
            dict with keys ``"input_ids"`` and ``"attention_mask"`` (taken from
            the tokenizer output), ``"categorical_vars"`` (float32 tensor or
            ``None``) and ``"labels"`` (tensor or ``None``). For ragged
            multilabel data the labels are a float multi-hot tensor of shape
            ``(batch_size, self.num_classes)``.
        """
        text, *categorical_vars, labels = zip(*batch)

        if self.labels is None:
            labels_tensor = None
        elif self.ragged_multilabel:
            # Pad the index lists to a common length with an impossible class
            # (-1). Long dtype keeps them valid as index tensors, also for
            # empty label lists.
            labels_padded = torch.nn.utils.rnn.pad_sequence(
                [torch.as_tensor(label, dtype=torch.long) for label in labels],
                batch_first=True,
                padding_value=-1,
            )
            batch_size = labels_padded.size(0)
            # Width comes from the dataset-wide class count, not a constant.
            labels_tensor = torch.zeros(batch_size, self.num_classes, dtype=torch.float32)
            mask = labels_padded != -1
            rows = torch.arange(batch_size).unsqueeze(1).expand_as(labels_padded)[mask]
            cols = labels_padded[mask]
            labels_tensor[rows, cols] = 1
        else:
            labels_tensor = torch.tensor(labels)

        tokenize_output = self.tokenizer.tokenize(list(text))

        if self.categorical_variables is not None:
            # The starred unpacking above wraps the categorical column in a
            # one-element list, hence the [0].
            categorical_tensors = torch.stack(
                [torch.tensor(cat_var, dtype=torch.float32) for cat_var in categorical_vars[0]]
            )
        else:
            categorical_tensors = None

        return {
            "input_ids": tokenize_output.input_ids,
            "attention_mask": tokenize_output.attention_mask,
            "categorical_vars": categorical_tensors,
            "labels": labels_tensor,
        }

    def create_dataloader(
        self,
        batch_size: int,
        shuffle: bool = False,
        drop_last: bool = False,
        # os.cpu_count() may return None; never let the default go negative.
        num_workers: int = max((os.cpu_count() or 1) - 1, 0),
        pin_memory: bool = False,
        persistent_workers: bool = True,
        **kwargs,
    ):
        """Build a ``DataLoader`` over this dataset using :meth:`collate_fn`.

        Args:
            batch_size: Number of samples per batch.
            shuffle: Forwarded to ``DataLoader``.
            drop_last: Forwarded to ``DataLoader``.
            num_workers: Worker processes; defaults to all CPUs but one.
            pin_memory: Forwarded to ``DataLoader``.
            persistent_workers: Forwarded to ``DataLoader``; forced to
                ``False`` when ``num_workers`` is 0.
            **kwargs: Extra keyword arguments forwarded to ``DataLoader``.

        Returns:
            The configured ``torch.utils.data.DataLoader``.
        """
        # persistent_workers requires num_workers > 0
        if num_workers == 0:
            persistent_workers = False

        return DataLoader(
            dataset=self,
            batch_size=batch_size,
            collate_fn=self.collate_fn,
            shuffle=shuffle,
            drop_last=drop_last,
            pin_memory=pin_memory,
            num_workers=num_workers,
            persistent_workers=persistent_workers,
            **kwargs,
        )

torchTextClassifiers/model/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .lightning import TextClassificationModule as TextClassificationModule
2	+ from .model import TextClassificationModel as TextClassificationModel

torchTextClassifiers/model/components/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+from .attention import (
+    AttentionConfig as AttentionConfig,
+)
+from .categorical_var_net import (
+    CategoricalForwardType as CategoricalForwardType,
+)
+from .categorical_var_net import (
+    CategoricalVariableNet as CategoricalVariableNet,
+)
+from .classification_head import ClassificationHead as ClassificationHead
+from .text_embedder import TextEmbedder as TextEmbedder
+from .text_embedder import TextEmbedderConfig as TextEmbedderConfig

torchTextClassifiers/model/components/attention.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""Largely inspired from Andrej Karpathy's nanochat, see here https://github.com/karpathy/nanochat/blob/master/nanochat/gpt.py"""
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+### Some utils used in text_embedder.py for the attention blocks ###
def apply_rotary_emb(x, cos, sin):
    """Rotate the two halves of the last dimension of ``x`` using ``cos``/``sin``.

    ``x`` must be 4-dimensional (multi-head layout). Its last dimension is
    split in two halves ``(a, b)`` and mapped to
    ``(a * cos + b * sin, -a * sin + b * cos)``. The result is cast back to
    the dtype of ``x``.
    """
    assert x.ndim == 4  # multihead attention
    half = x.shape[3] // 2
    first_half = x[..., :half]  # split the last dimension into two halves
    second_half = x[..., half:]
    rotated_first = first_half * cos + second_half * sin  # rotate pairs of dims
    rotated_second = first_half * (-sin) + second_half * cos
    rotated = torch.cat((rotated_first, rotated_second), dim=3)  # re-assemble
    return rotated.to(x.dtype)  # ensure input/output dtypes match
+def norm(x):
+    # Purely functional rmsnorm with no learnable params
+    return F.rms_norm(x, (x.size(-1),))
+#### Config #####
@dataclass
class AttentionConfig:
    """Hyper-parameters consumed by the attention blocks defined in this module.

    ``n_embd`` is declared last, with a default, so existing positional and
    keyword construction keeps working. The layers in this module read
    ``config.n_embd``, so it must be set before any of them is built.
    NOTE(review): presumably the text embedder assigns it from its embedding
    dimension — confirm against the caller.
    """

    # NOTE(review): presumably the number of stacked attention blocks — confirm.
    n_layers: int
    # Number of query heads; ``n_embd`` must be divisible by it.
    n_head: int
    # Number of key/value heads; must divide ``n_head`` (GQA when they differ).
    n_kv_head: int
    # NOTE(review): not read in this module; presumably the maximum sequence
    # length used to precompute rotary tables — confirm.
    sequence_len: Optional[int] = None
    # When True, SelfAttentionLayer applies rotary embeddings to q and k.
    positional_encoding: bool = True
    aggregation_method: str = "mean"  # or 'last', or 'first'
    # Model width read by SelfAttentionLayer and MLP as ``config.n_embd``.
    n_embd: Optional[int] = None
+#### Attention Block #####
+# Composed of SelfAttentionLayer and MLP with residual connections
class Block(nn.Module):
    """Residual block: normalised self-attention followed by a normalised MLP."""

    def __init__(self, config: AttentionConfig, layer_idx: int):
        """Build the attention and MLP sub-modules for layer ``layer_idx``."""
        super().__init__()
        self.layer_idx = layer_idx
        self.attn = SelfAttentionLayer(config, layer_idx)
        self.mlp = MLP(config)

    def forward(self, x, cos_sin):
        """Return ``x`` after the attention and MLP residual updates.

        ``cos_sin`` is passed through to the attention layer unchanged.
        """
        attn_update = self.attn(norm(x), cos_sin)
        x = x + attn_update
        mlp_update = self.mlp(norm(x))
        return x + mlp_update
+##### Components of the Block #####
class SelfAttentionLayer(nn.Module):
    """Non-causal multi-head self-attention with optional rotary embeddings and GQA."""

    def __init__(self, config: AttentionConfig, layer_idx):
        """Create the q/k/v and output projections described by ``config``.

        Reads ``n_head``, ``n_kv_head``, ``n_embd`` and ``positional_encoding``
        from ``config``. ``n_embd`` must be divisible by ``n_head`` and
        ``n_head`` must be a multiple of ``n_kv_head``.
        """
        super().__init__()
        self.layer_idx = layer_idx
        self.n_head = config.n_head
        self.n_kv_head = config.n_kv_head
        # Group Query Attention (GQA): duplicate key/value heads to match
        # query heads if desired
        self.enable_gqa = self.n_head != self.n_kv_head
        self.n_embd = config.n_embd
        self.head_dim = self.n_embd // self.n_head
        assert self.n_embd % self.n_head == 0
        assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0

        query_width = self.n_head * self.head_dim
        kv_width = self.n_kv_head * self.head_dim
        self.c_q = nn.Linear(self.n_embd, query_width, bias=False)
        self.c_k = nn.Linear(self.n_embd, kv_width, bias=False)
        self.c_v = nn.Linear(self.n_embd, kv_width, bias=False)
        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
        self.apply_positional_encoding = config.positional_encoding

    def forward(self, x, cos_sin=None):
        """Attend over ``x`` of shape ``(B, T, C)`` and return a ``(B, T, C)`` tensor.

        ``cos_sin`` is a ``(cos, sin)`` pair and is required when positional
        encoding is enabled.
        """
        batch, seq_len, _ = x.size()

        # Project the input to get queries, keys, and values
        q = self.c_q(x).view(batch, seq_len, self.n_head, self.head_dim)
        k = self.c_k(x).view(batch, seq_len, self.n_kv_head, self.head_dim)
        v = self.c_v(x).view(batch, seq_len, self.n_kv_head, self.head_dim)

        if self.apply_positional_encoding:
            assert cos_sin is not None, "Rotary embeddings require precomputed cos/sin tensors"
            cos, sin = cos_sin
            # QK rotary embedding
            q = apply_rotary_emb(q, cos, sin)
            k = apply_rotary_emb(k, cos, sin)

        # QK norm
        q = norm(q)
        k = norm(k)

        # make head be batch dim, i.e. (B, T, H, D) -> (B, H, T, D)
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))

        # is_causal=False for non-autoregressive models (BERT-like)
        attended = F.scaled_dot_product_attention(
            q, k, v, is_causal=False, enable_gqa=self.enable_gqa
        )

        # Re-assemble the heads side by side and project back to residual stream
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.c_proj(merged)
class MLP(nn.Module):
    """Bias-free two-layer feed-forward net with a squared-ReLU activation."""

    def __init__(self, config):
        """Build the ``n_embd -> 4 * n_embd -> n_embd`` projections from ``config.n_embd``."""
        super().__init__()
        width = config.n_embd
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(width, hidden, bias=False)
        self.c_proj = nn.Linear(hidden, width, bias=False)

    def forward(self, x):
        """Return ``c_proj(relu(c_fc(x)) ** 2)``."""
        activated = F.relu(self.c_fc(x)).square()
        return self.c_proj(activated)

torchTextClassifiers/model/components/categorical_var_net.py ADDED Viewed

@@ -0,0 +1,128 @@
+from enum import Enum
+from typing import List, Optional, Union
+import torch
+from torch import nn
class CategoricalForwardType(Enum):
    """Ways CategoricalVariableNet merges the per-feature embeddings."""

    # Embeddings are summed; chosen when no categorical embedding dims are given.
    SUM_TO_TEXT = "EMBEDDING_SUM_TO_TEXT"
    # Embeddings are averaged; chosen when a single int embedding dim is given.
    AVERAGE_AND_CONCAT = "EMBEDDING_AVERAGE_AND_CONCAT"
    # Embeddings are concatenated; chosen when a list of embedding dims is given.
    CONCATENATE_ALL = "EMBEDDING_CONCATENATE_ALL"
class CategoricalVariableNet(nn.Module):
    """Embed integer-coded categorical features and merge them into one vector per sample.

    The merge strategy (``forward_type``) and ``output_dim`` are derived from
    ``categorical_embedding_dims``:

    * ``None``  -> every feature uses ``text_embedding_dim``; embeddings are summed.
    * ``int``   -> every feature uses that dim; embeddings are averaged.
    * ``list``  -> one dim per feature; embeddings are concatenated.
    """

    def __init__(
        self,
        categorical_vocabulary_sizes: List[int],
        categorical_embedding_dims: Optional[Union[List[int], int]] = None,
        text_embedding_dim: Optional[int] = None,
    ):
        """Validate the configuration and create one ``nn.Embedding`` per feature.

        Raises:
            TypeError: If ``categorical_vocabulary_sizes`` is not a list, or
                ``categorical_embedding_dims`` is not an int, a list or None.
            ValueError: If the two lists differ in length, or both
                ``categorical_embedding_dims`` and ``text_embedding_dim`` are None.
        """
        super().__init__()

        self.categorical_vocabulary_sizes = categorical_vocabulary_sizes
        self.categorical_embedding_dims = categorical_embedding_dims
        self.text_embedding_dim = text_embedding_dim

        # Normalises the attributes above and sets forward_type / output_dim.
        self._validate_categorical_inputs()
        assert isinstance(
            self.forward_type, CategoricalForwardType
        ), "forward_type must be set after validation"
        assert isinstance(self.output_dim, int), "output_dim must be set as int after validation"

        # A plain dict gives index-based access; setattr registers each layer
        # as a sub-module under the name ``categorical_embedding_<idx>``.
        self.categorical_embedding_layers = {}
        feature_specs = zip(self.categorical_vocabulary_sizes, self.categorical_embedding_dims)
        for var_idx, (vocab_size, emb_dim) in enumerate(feature_specs):
            layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
            self.categorical_embedding_layers[var_idx] = layer
            setattr(self, f"categorical_embedding_{var_idx}", layer)

    def forward(self, categorical_vars_tensor: torch.Tensor) -> torch.Tensor:
        """Return the merged categorical embedding, shape ``(batch_size, output_dim)``."""
        cat_embeds = self._get_cat_embeds(categorical_vars_tensor)
        mode = self.forward_type

        if mode == CategoricalForwardType.SUM_TO_TEXT:
            # (bs, text_embed_dim)
            x_combined = torch.stack(cat_embeds, dim=0).sum(dim=0)
        elif mode == CategoricalForwardType.AVERAGE_AND_CONCAT:
            # (bs, embed_dim)
            x_combined = torch.stack(cat_embeds, dim=0).mean(dim=0)
        elif mode == CategoricalForwardType.CONCATENATE_ALL:
            # (bs, sum of all cat embed dims)
            x_combined = torch.cat(cat_embeds, dim=1)
        else:
            raise ValueError(f"Unknown forward type: {self.forward_type}")

        assert (
            x_combined.dim() == 2
        ), "Output combined tensor must be 2-dimensional (batch_size, embed_dim)"
        assert x_combined.size(1) == self.output_dim
        return x_combined

    def _get_cat_embeds(self, categorical_vars_tensor: torch.Tensor):
        """Look up each feature column in its embedding table.

        Column ``i`` of the input is embedded with layer ``i``.

        Raises:
            ValueError: If a column holds a negative value or one that is not
                below the layer's vocabulary size.
        """
        # Embedding lookups need integer indices; this is a no-op for long input.
        indices = categorical_vars_tensor.to(torch.long)

        embedded = []
        for i, embed_layer in self.categorical_embedding_layers.items():
            column = indices[:, i]

            # Check if categorical values are within valid range
            vocab_size = embed_layer.num_embeddings
            max_val = column.max().item()
            min_val = column.min().item()
            out_of_range = max_val >= vocab_size or min_val < 0
            if out_of_range:
                raise ValueError(
                    f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}."
                )

            cat_embed = embed_layer(column)
            if cat_embed.dim() > 2:
                cat_embed = cat_embed.squeeze(1)
            embedded.append(cat_embed)

        return embedded

    def _validate_categorical_inputs(self):
        """Check the constructor arguments and derive ``forward_type`` and ``output_dim``.

        On success ``categorical_embedding_dims`` is stored as a list with one
        entry per feature and ``num_categorical_features`` is set.
        """
        vocab_sizes = self.categorical_vocabulary_sizes
        emb_dims = self.categorical_embedding_dims

        if not isinstance(vocab_sizes, list):
            raise TypeError("categorical_vocabulary_sizes must be a list of int")

        if isinstance(emb_dims, list) and len(vocab_sizes) != len(emb_dims):
            raise ValueError(
                "Categorical vocabulary sizes and their embedding dimensions must have the same length"
            )

        n_features = len(vocab_sizes)

        # "Transform" embedding dims into a suitable list
        if emb_dims is None:
            if self.text_embedding_dim is None:
                raise ValueError(
                    "If categorical_embedding_dims is None, text_embedding_dim must be provided"
                )
            self.forward_type = CategoricalForwardType.SUM_TO_TEXT
            self.output_dim = self.text_embedding_dim
            emb_dims = [self.text_embedding_dim] * n_features
        elif isinstance(emb_dims, int):
            self.forward_type = CategoricalForwardType.AVERAGE_AND_CONCAT
            self.output_dim = emb_dims
            emb_dims = [emb_dims] * n_features
        elif isinstance(emb_dims, list):
            self.forward_type = CategoricalForwardType.CONCATENATE_ALL
            self.output_dim = sum(emb_dims)
        else:
            raise TypeError("categorical_embedding_dims must be an int, a list of int or None")

        assert (
            isinstance(emb_dims, list) or emb_dims is None
        ), "categorical_embedding_dims must be a list of int at this point"

        self.categorical_vocabulary_sizes = vocab_sizes
        self.categorical_embedding_dims = emb_dims
        self.num_categorical_features = n_features

torchTextClassifiers/model/components/classification_head.py ADDED Viewed

@@ -0,0 +1,61 @@
+from typing import Optional
+import torch
+from torch import nn
class ClassificationHead(nn.Module):
    """Final layer(s) mapping feature vectors to class scores."""

    def __init__(
        self,
        input_dim: Optional[int] = None,
        num_classes: Optional[int] = None,
        net: Optional[nn.Module] = None,
    ):
        """
        Classification head for text classification tasks.
        It is a nn.Module that can either be a simple Linear layer or a custom neural network module.

        Args:
            input_dim (int, optional): Dimension of the input features. Required if net is not provided.
            num_classes (int, optional): Number of output classes. Required if net is not provided.
            net (nn.Module, optional): Custom neural network module to be used as the classification head.
                If provided, input_dim and num_classes are inferred from this module.
                Should be either an nn.Sequential with first and last layers being Linears or nn.Linear.

        Raises:
            ValueError: If ``net`` is neither an nn.Sequential nor an nn.Linear.
            TypeError: If ``net`` is an nn.Sequential whose first or last layer is not an nn.Linear.
            AssertionError: If ``net`` is None and ``input_dim`` or ``num_classes`` is missing.
        """
        super().__init__()

        if net is not None:
            # --- Custom net should either be a Sequential or a Linear ---
            # Validated before registration so a rejected module never ends
            # up attached to this head.
            if not isinstance(net, (nn.Sequential, nn.Linear)):
                raise ValueError(
                    "net must be an nn.Sequential or an nn.Linear when provided, "
                    f"got {type(net).__name__}."
                )

            if isinstance(net, nn.Sequential):
                # --- If Sequential, check first and last layers are Linear ---
                first = net[0]
                last = net[-1]
                if not isinstance(first, nn.Linear):
                    raise TypeError(f"First layer must be nn.Linear, got {type(first).__name__}.")
                if not isinstance(last, nn.Linear):
                    raise TypeError(f"Last layer must be nn.Linear, got {type(last).__name__}.")
                # --- Extract features ---
                inferred_input_dim = first.in_features
                inferred_num_classes = last.out_features
            else:  # if not Sequential, it is a Linear
                inferred_input_dim = net.in_features
                inferred_num_classes = net.out_features

            self.net = net
            self.input_dim = inferred_input_dim
            self.num_classes = inferred_num_classes
        else:
            assert (
                input_dim is not None and num_classes is not None
            ), "Either net or both input_dim and num_classes must be provided."
            self.net = nn.Linear(input_dim, num_classes)
            self.input_dim = input_dim
            self.num_classes = num_classes

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the underlying network to ``x`` and return its output."""
        return self.net(x)

torchtextclassifiers 0.0.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

torchtextclassifiers 0.0.1-py3-none-any.whl → 1.0.0-py3-none-any.whl