torchtextclassifiers 0.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,191 @@
+"""
+Simple text classifier example that doesn't require a tokenizer.
+
+This demonstrates how to create a classifier wrapper that uses
+different text preprocessing approaches.
+"""
+
+from typing import Optional, Dict, Any
+from dataclasses import dataclass, asdict
+import numpy as np
+import torch
+import torch.nn as nn
+from sklearn.feature_extraction.text import TfidfVectorizer
+from torch.utils.data import Dataset, DataLoader
+import pytorch_lightning as pl
+from torch.optim import Adam
+
+from .base import BaseClassifierWrapper, BaseClassifierConfig
+
+
+@dataclass
+class SimpleTextConfig(BaseClassifierConfig):
+    """Configuration for simple text classifier using TF-IDF."""
+
+    hidden_dim: int = 128
+    num_classes: Optional[int] = None
+    max_features: int = 10000
+    learning_rate: float = 1e-3
+    dropout_rate: float = 0.1
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "SimpleTextConfig":
+        return cls(**data)
+
+
+class SimpleTextDataset(Dataset):
+    """Dataset for simple text classifier."""
+
+    def __init__(self, features: np.ndarray, labels: np.ndarray):
+        self.features = torch.FloatTensor(features)
+        self.labels = torch.LongTensor(labels)
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, idx):
+        return self.features[idx], self.labels[idx]
+
+
+class SimpleTextModel(nn.Module):
+    """Simple neural network for text classification using TF-IDF features."""
+
+    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int, dropout_rate: float = 0.1):
+        super().__init__()
+
+        self.network = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            nn.Linear(hidden_dim, hidden_dim // 2),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            nn.Linear(hidden_dim // 2, num_classes)
+        )
+
+    def forward(self, x):
+        return self.network(x)
+
+
+class SimpleTextModule(pl.LightningModule):
+    """Lightning module for simple text classifier."""
+
+    def __init__(self, model: nn.Module, learning_rate: float = 1e-3):
+        super().__init__()
+        self.model = model
+        self.learning_rate = learning_rate
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, x):
+        return self.model(x)
+
+    def training_step(self, batch, batch_idx):
+        features, labels = batch
+        logits = self(features)
+        loss = self.loss_fn(logits, labels)
+        self.log('train_loss', loss)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        features, labels = batch
+        logits = self(features)
+        loss = self.loss_fn(logits, labels)
+        self.log('val_loss', loss)
+        return loss
+
+    def configure_optimizers(self):
+        return Adam(self.parameters(), lr=self.learning_rate)
+
+
+class SimpleTextWrapper(BaseClassifierWrapper):
+    """Wrapper for a simple text classifier that uses TF-IDF instead of tokenization."""
+
+    def __init__(self, config: SimpleTextConfig):
+        super().__init__(config)
+        self.config: SimpleTextConfig = config
+        self.vectorizer: Optional[TfidfVectorizer] = None
+
+    def prepare_text_features(self, training_text: np.ndarray) -> None:
+        """Prepare a TF-IDF vectorizer instead of a tokenizer."""
+        self.vectorizer = TfidfVectorizer(
+            max_features=self.config.max_features,
+            lowercase=True,
+            stop_words='english'
+        )
+        # Fit the vectorizer on the training text
+        self.vectorizer.fit(training_text)
+
+    def _build_pytorch_model(self) -> None:
+        """Build the PyTorch model."""
+        if self.vectorizer is None:
+            raise ValueError("Must call prepare_text_features first")
+
+        input_dim = len(self.vectorizer.get_feature_names_out())
+
+        self.pytorch_model = SimpleTextModel(
+            input_dim=input_dim,
+            hidden_dim=self.config.hidden_dim,
+            num_classes=self.config.num_classes,
+            dropout_rate=self.config.dropout_rate
+        )
+
+    def _check_and_init_lightning(self, **kwargs) -> None:
+        """Initialize Lightning module."""
+        self.lightning_module = SimpleTextModule(
+            model=self.pytorch_model,
+            learning_rate=self.config.learning_rate
+        )
+
+    def predict(self, X: np.ndarray, **kwargs) -> np.ndarray:
+        """Make predictions."""
+        if not self.trained:
+            raise RuntimeError("Model must be trained first.")
+
+        # Extract text from X (assuming the first column is text)
+        text_data = X[:, 0] if X.ndim > 1 else X
+
+        # Transform text to TF-IDF features
+        features = self.vectorizer.transform(text_data).toarray()
+        features_tensor = torch.FloatTensor(features)
+
+        self.pytorch_model.eval()
+        with torch.no_grad():
+            logits = self.pytorch_model(features_tensor)
+            predictions = torch.argmax(logits, dim=1)
+
+        return predictions.numpy()
+
+    def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        """Validate the model; returns accuracy."""
+        predictions = self.predict(X)
+        accuracy = (predictions == Y).mean()
+        return float(accuracy)
+
+    def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: Optional[np.ndarray] = None):
+        """Create a dataset. `categorical_variables` is accepted for interface compatibility but unused here."""
+        # Transform text to TF-IDF features
+        features = self.vectorizer.transform(texts).toarray()
+        return SimpleTextDataset(features, labels)
+
+    def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True):
+        """Create a dataloader."""
+        return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)
+
+    def load_best_model(self, checkpoint_path: str) -> None:
+        """Load best model from checkpoint."""
+        self.lightning_module = SimpleTextModule.load_from_checkpoint(
+            checkpoint_path,
+            model=self.pytorch_model,
+            learning_rate=self.config.learning_rate
+        )
+        self.pytorch_model = self.lightning_module.model
+        self.trained = True
+        self.pytorch_model.eval()
+
+    @classmethod
+    def get_config_class(cls):
+        """Return the configuration class."""
+        return SimpleTextConfig
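
For context, here is a minimal usage sketch of the wrapper above. It is illustrative only: the import path is an assumption about the package layout, and the actual training loop lives in BaseClassifierWrapper, which is not part of this diff.

    import numpy as np
    # Hypothetical import path; the real module location inside the package may differ.
    from torchtextclassifiers.classifiers.simple_text import SimpleTextConfig, SimpleTextWrapper

    texts = np.array(["great product", "terrible service", "works fine"])
    labels = np.array([1, 0, 1])

    wrapper = SimpleTextWrapper(SimpleTextConfig(num_classes=2))
    wrapper.prepare_text_features(texts)  # fits the TF-IDF vectorizer on the training text

    # Dataset/dataloader creation as exposed by the wrapper; the base class's
    # training loop (not shown in this diff) would consume these.
    dataset = wrapper.create_dataset(texts, labels)
    loader = wrapper.create_dataloader(dataset, batch_size=2)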
@@ -0,0 +1,34 @@
+"""Generic factories for different classifier types."""
+
+from typing import Dict, Callable
+from .classifiers.base import BaseClassifierConfig
+
+# Registry of config factories for different classifier types
+CONFIG_FACTORIES: Dict[str, Callable[[dict], BaseClassifierConfig]] = {}
+
+
+def register_config_factory(classifier_type: str, factory_func: Callable[[dict], BaseClassifierConfig]):
+    """Register a config factory for a classifier type."""
+    CONFIG_FACTORIES[classifier_type] = factory_func
+
+
+def create_config_from_dict(classifier_type: str, config_dict: dict) -> BaseClassifierConfig:
+    """Create a config object from dictionary based on classifier type."""
+    if classifier_type not in CONFIG_FACTORIES:
+        raise ValueError(f"Unsupported classifier type: {classifier_type}")
+
+    return CONFIG_FACTORIES[classifier_type](config_dict)
+
+
+# Register FastText factory
+def _register_fasttext_factory():
+    """Register FastText config factory."""
+    try:
+        from .classifiers.fasttext.core import FastTextFactory
+        register_config_factory("fasttext", FastTextFactory.from_dict)
+    except ImportError:
+        pass  # FastText module not available
+
+
+# Auto-register available factories
+_register_fasttext_factory()
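
To illustrate the registry pattern, a hedged sketch of registering and using a custom factory (the "simple_text" key and the SimpleTextConfig wiring are hypothetical, not something the package registers itself, and the import paths are assumptions):

    from torchtextclassifiers.factories import register_config_factory, create_config_from_dict  # assumed path
    from torchtextclassifiers.classifiers.simple_text import SimpleTextConfig  # hypothetical module path

    # Any callable mapping a plain dict to a BaseClassifierConfig can be registered.
    register_config_factory("simple_text", SimpleTextConfig.from_dict)

    config = create_config_from_dict("simple_text", {"hidden_dim": 64, "num_classes": 3})
    assert config.num_classes == 3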