torchtextclassifiers-0.0.1-py3-none-any.whl → torchtextclassifiers-0.1.0-py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
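For anyone who wants to reproduce a comparison like this locally, here is a minimal sketch (not part of the package) that unpacks both wheels with the standard library and prints a unified diff per file. The wheel filenames are assumptions based on the versions named above and are expected to sit in the working directory.

```python
# Minimal sketch: compare two downloaded wheels file-by-file using only the
# standard library. The wheel filenames below are assumptions.
import difflib
import zipfile

OLD_WHEEL = "torchtextclassifiers-0.0.1-py3-none-any.whl"
NEW_WHEEL = "torchtextclassifiers-0.1.0-py3-none-any.whl"


def wheel_text_contents(path: str) -> dict[str, list[str]]:
    """Read every UTF-8 decodable member of a wheel (a zip archive) into lines."""
    contents: dict[str, list[str]] = {}
    with zipfile.ZipFile(path) as wheel:
        for name in wheel.namelist():
            try:
                contents[name] = wheel.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                continue  # skip binary members
    return contents


old, new = wheel_text_contents(OLD_WHEEL), wheel_text_contents(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    diff = difflib.unified_diff(
        old.get(name, []), new.get(name, []),
        fromfile=f"0.0.1/{name}", tofile=f"0.1.0/{name}",
    )
    print("".join(diff), end="")
```

The per-file summary that follows, and the full hunks further down, are what such a comparison produces.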
- torchTextClassifiers/__init__.py +12 -48
- torchTextClassifiers/dataset/__init__.py +1 -0
- torchTextClassifiers/dataset/dataset.py +114 -0
- torchTextClassifiers/model/__init__.py +2 -0
- torchTextClassifiers/model/components/__init__.py +12 -0
- torchTextClassifiers/model/components/attention.py +126 -0
- torchTextClassifiers/model/components/categorical_var_net.py +128 -0
- torchTextClassifiers/model/components/classification_head.py +43 -0
- torchTextClassifiers/model/components/text_embedder.py +220 -0
- torchTextClassifiers/model/lightning.py +166 -0
- torchTextClassifiers/model/model.py +151 -0
- torchTextClassifiers/tokenizers/WordPiece.py +92 -0
- torchTextClassifiers/tokenizers/__init__.py +10 -0
- torchTextClassifiers/tokenizers/base.py +205 -0
- torchTextClassifiers/tokenizers/ngram.py +472 -0
- torchTextClassifiers/torchTextClassifiers.py +463 -405
- torchTextClassifiers/utilities/__init__.py +0 -3
- torchTextClassifiers/utilities/plot_explainability.py +184 -0
- torchtextclassifiers-0.1.0.dist-info/METADATA +73 -0
- torchtextclassifiers-0.1.0.dist-info/RECORD +21 -0
- {torchtextclassifiers-0.0.1.dist-info → torchtextclassifiers-0.1.0.dist-info}/WHEEL +1 -1
- torchTextClassifiers/classifiers/base.py +0 -83
- torchTextClassifiers/classifiers/fasttext/__init__.py +0 -25
- torchTextClassifiers/classifiers/fasttext/core.py +0 -269
- torchTextClassifiers/classifiers/fasttext/model.py +0 -752
- torchTextClassifiers/classifiers/fasttext/tokenizer.py +0 -346
- torchTextClassifiers/classifiers/fasttext/wrapper.py +0 -216
- torchTextClassifiers/classifiers/simple_text_classifier.py +0 -191
- torchTextClassifiers/factories.py +0 -34
- torchTextClassifiers/utilities/checkers.py +0 -108
- torchTextClassifiers/utilities/preprocess.py +0 -82
- torchTextClassifiers/utilities/utils.py +0 -346
- torchtextclassifiers-0.0.1.dist-info/METADATA +0 -187
- torchtextclassifiers-0.0.1.dist-info/RECORD +0 -17
Removed file: `torchTextClassifiers/utilities/utils.py` (word- and letter-level explainability helpers from 0.0.1, 346 lines):

```diff
--- torchTextClassifiers/utilities/utils.py  (0.0.1)
+++ /dev/null  (0.1.0)
@@ -1,346 +0,0 @@
-"""
-Utility functions.
-"""
-
-import warnings
-import difflib
-from difflib import SequenceMatcher
-
-import torch
-import torch.nn.functional as F
-
-from .preprocess import clean_text_feature
-
-
-def preprocess_token(token):
-    preprocessed_token = token.replace("</s>", "")
-    preprocessed_token = preprocessed_token.replace("<", "")
-    preprocessed_token = preprocessed_token.replace(">", "")
-
-    preprocessed_token = preprocessed_token.split()
-
-    return preprocessed_token
-
-
-def map_processed_to_original(processed_words, original_words, n=1, cutoff=0.9):
-    """
-    Map processed words to original words based on similarity scores.
-
-    Args:
-        processed_words (List[str]): List of processed words.
-        original_words (List[str]): List of original words.
-        n (int): Number of closest processed words to consider for a given original word.
-        cutoff (float): Minimum similarity score for a match.
-
-    Returns:
-        Dict[str, str]: Mapping from original word to the corresponding closest processed word.
-    """
-
-    # For each word in the original list, find the closest matching processed word
-    word_mapping = {}
-
-    for original_word in original_words:
-        original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[
-            0
-        ]  # Preprocess the original word
-
-        if original_word_prepro == "":
-            continue
-
-        max_similarity_score = 0
-        best_processed_word = None
-        # Calculate the similarity score for each processed word with the current original word
-        for processed_word in processed_words:
-            similarity_score = difflib.SequenceMatcher(
-                None, processed_word, original_word_prepro
-            ).ratio()  # Ratcliff-Obershelp algorithm
-
-            # Only consider matches with similarity above the cutoff
-            if similarity_score > max_similarity_score and similarity_score >= cutoff:
-                max_similarity_score = similarity_score
-                best_processed_word = processed_word
-
-        if best_processed_word is not None:
-            # original_word = original_word.replace(',', '')
-            # Add the best matching processed word to the mapping
-            word_mapping[original_word] = best_processed_word
-
-    return word_mapping
-
-
-def test_end_of_word(all_processed_words, word, target_token, next_token, min_n):
-    flag = False
-    if target_token[-1] == ">":
-        if next_token[0] == "<":
-            if word in target_token:
-                flag = True
-            if word in next_token:
-                flag = False
-                if next_token[1] != word[0]:
-                    flag = True
-    if len(next_token) == min_n:
-        flag = True
-    if next_token in all_processed_words:
-        flag = True
-
-    return flag
-
-
-def match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n):
-    """
-    Match words to token indexes in a sentence.
-
-    Args:
-        sentence (str): Preprocessed sentence.
-        tokenized_sentence_tokens (List[str]): List of tokenized sentence tokens.
-
-    Returns:
-        Dict[str, List[int]]: Mapping from word to list of token indexes.
-
-    """
-
-    pointer_token = 0
-    res = {}
-    processed_sentence = clean_text_feature([sentence], remove_stop_words=False)[0]
-    processed_words = processed_sentence.split()
-    # we know the tokens are in the right order
-    for index_word, word in enumerate(processed_words):
-        if word not in res:
-            res[word] = []
-
-        start = pointer_token
-
-        # while we have not reached the end of the word, keep going
-        while not test_end_of_word(
-            processed_words,
-            word,
-            tokenized_sentence_tokens[pointer_token],
-            tokenized_sentence_tokens[pointer_token + 1],
-            min_n=min_n,
-        ):
-            pointer_token += 1
-            if pointer_token == len(tokenized_sentence_tokens) - 1:
-                warnings.warn("Error in the tokenization of the sentence")
-                # workaround to avoid an error: associate each word with regular ranges
-                chunck = len(tokenized_sentence_tokens) // len(processed_words)
-                for idx, word in enumerate(processed_words):
-                    res[word] = range(
-                        idx * chunck, min((idx + 1) * chunck, len(tokenized_sentence_tokens))
-                    )
-                return res
-
-        pointer_token += 1
-        end = pointer_token
-
-        res[word] += list(range(start, end))
-
-    # here we arrive at the end of the sentence
-    assert tokenized_sentence_tokens[pointer_token] == "</s>"
-    end_of_string_position = pointer_token
-
-    # word n-grams start here
-    pointer_token += 1
-    while pointer_token < len(tokenized_sentence_tokens):
-        token = tokenized_sentence_tokens[pointer_token]
-        for index_word, word in enumerate(processed_sentence.split()):
-            # now the matching condition changes: we need to find the word in the token
-            if word in token:
-                res[word].append(pointer_token)
-        pointer_token += 1
-
-    assert pointer_token == len(tokenized_sentence_tokens)
-    assert set(sum([v for v in res.values()], [end_of_string_position])) == set(
-        range(len(tokenized_sentence_tokens))
-    ), print(
-        set(range(len(tokenized_sentence_tokens)))
-        - set(sum([v for v in res.values()], [end_of_string_position]))
-    )  # verify that all tokens are used
-
-    return res
-
-
-# at text level
-def compute_preprocessed_word_score(
-    preprocessed_text,
-    tokenized_text_tokens,
-    scores,
-    id_to_token_dicts,
-    token_to_id_dicts,
-    min_n,
-    padding_index=2009603,
-    end_of_string_index=0,
-):
-    """
-    Compute preprocessed word scores based on token scores.
-
-    Args:
-        preprocessed_text (List[str]): List of preprocessed sentences.
-        tokenized_text_tokens (List[List[str]]): For each sentence, list of tokens in string form.
-        scores (List[torch.Tensor]): For each sentence, list of token scores.
-        id_to_token_dicts (List[Dict[int, str]]): For each sentence, mapping from token ID to token in string form.
-        token_to_id_dicts (List[Dict[str, int]]): For each sentence, mapping from token (string) to token ID.
-        min_n (int): Minimum character n-gram length used by the tokenizer.
-        padding_index (int): Index of the padding token.
-        end_of_string_index (int): Index of the end-of-string token.
-
-    Returns:
-        List[Dict[str, float]]: For each sentence, mapping from preprocessed word to score.
-    """
-
-    word_to_score_dicts = []
-    word_to_token_idx_dicts = []
-
-    for idx, sentence in enumerate(preprocessed_text):
-        tokenized_sentence_tokens = tokenized_text_tokens[idx]  # sentence level, List[str]
-        word_to_token_idx = match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n)
-        score_sentence_topk = scores[idx]  # torch.Tensor, token scores, (top_k, seq_len)
-
-        # Calculate the score for each token and map to words
-        word_to_score_topk = []
-        for k in range(len(score_sentence_topk)):
-            # Initialize word-to-score dictionary with zero values
-            word_to_score = {word: 0 for word in sentence.split()}
-
-            score_sentence = score_sentence_topk[k]
-            for word, associated_token_idx in word_to_token_idx.items():
-                associated_token_idx = torch.tensor(associated_token_idx).int()
-                word_to_score[word] = torch.sum(score_sentence[associated_token_idx]).item()
-
-            word_to_score_topk.append(word_to_score.copy())
-
-        word_to_score_dicts.append(word_to_score_topk)
-        word_to_token_idx_dicts.append(word_to_token_idx)
-
-    return word_to_score_dicts, word_to_token_idx_dicts
-
-
-def compute_word_score(word_to_score_dicts, text, n=5, cutoff=0.75):
-    """
-    Compute word scores based on preprocessed word scores.
-
-    Args:
-        word_to_score_dicts (List[List[Dict[str, float]]]): For each sentence, list of top_k mappings from preprocessed word to score.
-        text (List[str]): List of sentences.
-        n (int): Number of closest preprocessed words to consider for a given original word.
-        cutoff (float): Minimum similarity score for a match.
-
-    Returns:
-        List[List[List[float]]]: For each sentence, list of top-k scores for each word.
-    """
-
-    all_scores_text = []
-    mappings = []
-    for idx, word_to_score_topk in enumerate(word_to_score_dicts):  # iteration over sentences
-        all_scores_topk = []
-        processed_words = list(word_to_score_topk[0].keys())
-        original_words = text[idx].split()
-        original_words = list(filter(lambda x: x != ",", original_words))
-        mapping = map_processed_to_original(
-            processed_words, original_words, n=n, cutoff=cutoff
-        )  # Dict[str, str]
-        mappings.append(mapping)
-        for word_to_score in word_to_score_topk:  # iteration over top_k (the preds)
-            scores = []
-            stopwords_idx = []
-            for pos_word, word in enumerate(original_words):
-                if word not in mapping:
-                    scores.append(0)
-                    stopwords_idx.append(pos_word)
-                    continue
-                matching_processed_word = mapping[word]
-                word_score = word_to_score[matching_processed_word]
-                scores.append(word_score)
-
-            scores = torch.tensor(scores)
-            scores = F.softmax(
-                scores, dim=-1
-            )  # softmax normalization. Length = len(original_words)
-            scores[stopwords_idx] = 0
-
-            all_scores_topk.append(scores)  # length top_k
-
-        all_scores_text.append(all_scores_topk)  # length = len(text)
-
-    return all_scores_text, mappings
-
-
-def explain_continuous(
-    text, processed_text, tokenized_text_tokens, mappings, word_to_token_idx_dicts, all_attr, top_k
-):
-    """
-    Score explanation at letter level.
-
-    Args:
-        text (List[str]): List of original sentences.
-        processed_text (List[str]): List of preprocessed sentences.
-        tokenized_text_tokens (List[List[str]]): List of tokenized sentences.
-        mappings (List[Dict[str, str]]): List of mappings from original word to preprocessed word.
-        word_to_token_idx_dicts (List[Dict[str, List[int]]]): List of mappings from preprocessed word to token indexes.
-        all_attr (torch.Tensor): Tensor of token scores.
-        top_k (int): Number of top predictions to consider.
-
-    Returns:
-        List[torch.Tensor]: List of letter scores for each sentence.
-
-    """
-    all_scores_text = []
-    for idx, processed_sentence in enumerate(processed_text):
-        tokenized_sentence_tokens = tokenized_text_tokens[idx]
-        mapping = mappings[idx]
-        word_to_token_idx = word_to_token_idx_dicts[idx]
-        original_words = text[idx].split()
-        original_words = list(filter(lambda x: x != ",", original_words))
-
-        original_to_token = {}
-        original_to_token_idxs = {}
-
-        for original in original_words:
-            # original = original.replace(',', '')
-            if original not in mapping:
-                continue
-
-            matching_processed_word = mapping[original]
-            associated_token_idx = word_to_token_idx[matching_processed_word]
-            original_to_token[original] = [
-                tokenized_sentence_tokens[token_idx] for token_idx in associated_token_idx
-            ]
-            original_to_token_idxs[original] = associated_token_idx
-
-        scores_for_k = []
-        for k in range(top_k):
-            scores_for_words = []
-            for xxx, original_word in enumerate(original_words):
-                original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[
-                    0
-                ]
-
-                letters = list(original_word)
-                scores_letter = torch.zeros(len(letters), dtype=torch.float32)
-
-                if original_word not in original_to_token:  # if stopword, 0
-                    scores_for_words.append(scores_letter)
-                    continue
-
-                for pos, token in enumerate(original_to_token[original_word]):
-                    pos_token = original_to_token_idxs[original_word][pos]
-                    # tok = preprocess_token(token)[0]
-                    tok = preprocess_token(token)
-                    score_token = all_attr[idx, k, pos_token].item()
-
-                    # Spread the token score over the matching letters of the word
-                    sm = SequenceMatcher(None, original_word_prepro, tok)
-                    a, _, size = sm.find_longest_match()
-                    scores_letter[a : a + size] += score_token
-
-                scores_for_words.append(scores_letter)
-
-            all_scores_letter = torch.cat(scores_for_words)
-            scores = F.softmax(all_scores_letter, dim=-1)
-            scores[all_scores_letter == 0] = 0
-            scores_for_k.append(scores)
-
-        scores_for_sentence = torch.stack(scores_for_k)
-        all_scores_text.append(scores_for_sentence)
-
-    return torch.stack(all_scores_text)
```
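The removed utilities above lean on `difflib`'s Ratcliff/Obershelp similarity to map each original word back to its preprocessed form. As a standalone illustration, independent of the package, with made-up inputs and a simplified lowercase/strip stand-in for `clean_text_feature`, the matching step boils down to:

```python
# Standalone illustration of the matching used by the removed map_processed_to_original:
# for each original word, keep the processed word with the highest Ratcliff/Obershelp
# ratio, provided it clears a similarity cutoff.
import difflib

processed_words = ["boulangerie", "artisanale", "pain"]   # made-up example inputs
original_words = ["Boulangerie", "artisanale,", "pains"]

cutoff = 0.75
mapping = {}
for original in original_words:
    candidate = original.lower().strip(",")               # stand-in for clean_text_feature
    best_word, best_score = None, 0.0
    for processed in processed_words:
        score = difflib.SequenceMatcher(None, processed, candidate).ratio()
        if score > best_score and score >= cutoff:
            best_word, best_score = processed, score
    if best_word is not None:
        mapping[original] = best_word

print(mapping)
# {'Boulangerie': 'boulangerie', 'artisanale,': 'artisanale', 'pains': 'pain'}
```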
Removed file: `torchtextclassifiers-0.0.1.dist-info/METADATA` (the 0.0.1 package metadata and README, 187 lines):

```diff
--- torchtextclassifiers-0.0.1.dist-info/METADATA  (0.0.1)
+++ /dev/null  (0.1.0)
@@ -1,187 +0,0 @@
-Metadata-Version: 2.3
-Name: torchtextclassifiers
-Version: 0.0.1
-Summary: An implementation of the https://github.com/facebookresearch/fastText supervised learning algorithm for text classification using PyTorch.
-Keywords: fastText,text classification,NLP,automatic coding,deep learning
-Author: Tom Seimandi, Julien Pramil, Meilame Tayebjee, Cédric Couralet
-Author-email: Tom Seimandi <tom.seimandi@gmail.com>, Julien Pramil <julien.pramil@insee.fr>, Meilame Tayebjee <meilame.tayebjee@insee.fr>, Cédric Couralet <cedric.couralet@insee.fr>
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Dist: numpy>=1.26.4
-Requires-Dist: pytorch-lightning>=2.4.0
-Requires-Dist: unidecode ; extra == 'explainability'
-Requires-Dist: nltk ; extra == 'explainability'
-Requires-Dist: captum ; extra == 'explainability'
-Requires-Dist: unidecode ; extra == 'preprocess'
-Requires-Dist: nltk ; extra == 'preprocess'
-Requires-Python: >=3.11
-Provides-Extra: explainability
-Provides-Extra: preprocess
-Description-Content-Type: text/markdown
-
-# torchTextClassifiers
-
-A unified, extensible framework for text classification built on [PyTorch](https://pytorch.org/) and [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/).
-
-## 🚀 Features
-
-- **Unified API**: Consistent interface for different classifier wrappers
-- **Extensible**: Easy to add new classifier implementations through the wrapper pattern
-- **FastText Support**: Built-in FastText classifier with n-gram tokenization
-- **Flexible Preprocessing**: Each classifier can implement its own text preprocessing approach
-- **PyTorch Lightning**: Automated training with callbacks, early stopping, and logging
-
-
-## 📦 Installation
-
-```bash
-# Clone the repository
-git clone https://github.com/InseeFrLab/torchTextClassifiers.git
-cd torchTextClassifiers
-
-# Install with uv (recommended)
-uv sync
-
-# Or install with pip
-pip install -e .
-```
-
-## 🎯 Quick Start
-
-### Basic FastText Classification
-
-```python
-import numpy as np
-from torchTextClassifiers import create_fasttext
-
-# Create a FastText classifier
-classifier = create_fasttext(
-    embedding_dim=100,
-    sparse=False,
-    num_tokens=10000,
-    min_count=2,
-    min_n=3,
-    max_n=6,
-    len_word_ngrams=2,
-    num_classes=2
-)
-
-# Prepare your data
-X_train = np.array([
-    "This is a positive example",
-    "This is a negative example",
-    "Another positive case",
-    "Another negative case"
-])
-y_train = np.array([1, 0, 1, 0])
-
-X_val = np.array([
-    "Validation positive",
-    "Validation negative"
-])
-y_val = np.array([1, 0])
-
-# Build the model
-classifier.build(X_train, y_train)
-
-# Train the model
-classifier.train(
-    X_train, y_train, X_val, y_val,
-    num_epochs=50,
-    batch_size=32,
-    patience_train=5,
-    verbose=True
-)
-
-# Make predictions
-X_test = np.array(["This is a test sentence"])
-predictions = classifier.predict(X_test)
-print(f"Predictions: {predictions}")
-
-# Validate on test set
-accuracy = classifier.validate(X_test, np.array([1]))
-print(f"Accuracy: {accuracy:.3f}")
-```
-
-### Custom Classifier Implementation
-
-```python
-import numpy as np
-from torchTextClassifiers import torchTextClassifiers
-from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextWrapper, SimpleTextConfig
-
-# Example: TF-IDF based classifier (alternative to tokenization)
-config = SimpleTextConfig(
-    hidden_dim=128,
-    num_classes=2,
-    max_features=5000,
-    learning_rate=1e-3,
-    dropout_rate=0.2
-)
-
-# Create classifier with TF-IDF preprocessing
-wrapper = SimpleTextWrapper(config)
-classifier = torchTextClassifiers(wrapper)
-
-# Text data
-X_train = np.array(["Great product!", "Terrible service", "Love it!"])
-y_train = np.array([1, 0, 1])
-
-# Build and train
-classifier.build(X_train, y_train)
-# ... continue with training
-```
-
-
-### Training Customization
-
-```python
-# Custom PyTorch Lightning trainer parameters
-trainer_params = {
-    'accelerator': 'gpu',
-    'devices': 1,
-    'precision': 16,  # Mixed precision training
-    'gradient_clip_val': 1.0,
-}
-
-classifier.train(
-    X_train, y_train, X_val, y_val,
-    num_epochs=100,
-    batch_size=64,
-    patience_train=10,
-    trainer_params=trainer_params,
-    verbose=True
-)
-```
-
-## 🔬 Testing
-
-Run the test suite:
-
-```bash
-# Run all tests
-uv run pytest
-
-# Run with coverage
-uv run pytest --cov=torchTextClassifiers
-
-# Run specific test file
-uv run pytest tests/test_torchTextClassifiers.py -v
-```
-
-
-## 📚 Examples
-
-See the [examples/](examples/) directory for:
-- Basic text classification
-- Multi-class classification
-- Mixed features (text + categorical)
-- Custom classifier implementation
-- Advanced training configurations
-
-
-
-## 📄 License
-
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
```
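As a side note, the `Provides-Extra`/`Requires-Dist` entries above declare two optional dependency groups, `explainability` and `preprocess`. A small sketch, assuming the 0.0.1 package is installed in the current environment, of how those declared requirements can be read back at runtime with the standard library:

```python
# Read the declared dependencies (including extras markers) of an installed
# distribution via importlib.metadata; assumes torchtextclassifiers 0.0.1 is installed.
from importlib.metadata import requires

for requirement in requires("torchtextclassifiers") or []:
    print(requirement)
# Expected output includes lines such as:
#   numpy>=1.26.4
#   unidecode ; extra == 'explainability'
```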
Removed file: `torchtextclassifiers-0.0.1.dist-info/RECORD` (the 0.0.1 file manifest, 17 lines):

```diff
--- torchtextclassifiers-0.0.1.dist-info/RECORD  (0.0.1)
+++ /dev/null  (0.1.0)
@@ -1,17 +0,0 @@
-torchTextClassifiers/__init__.py,sha256=dc77f92c57d9a0782777f83e955157be26ab0bce60434877a7361d1492978279,2228
-torchTextClassifiers/classifiers/base.py,sha256=549669aca59fcdbca53d6c240e40e1f282d71dd99d9eb18010d37ae2a5843ce6,2796
-torchTextClassifiers/classifiers/fasttext/__init__.py,sha256=e326a8f1f6018ea57715f94b5d14c1b18254115088911bb4e7c4f472d2ec6044,778
-torchTextClassifiers/classifiers/fasttext/core.py,sha256=0b9d27c67f8eedbf6e9425943b10404bb6763709190351df01667ce3fc32f7f6,9943
-torchTextClassifiers/classifiers/fasttext/model.py,sha256=4a3cd5b5403c5437e5c7d953dbc0a44b8e57ce5918b32b8b50227a8449c441b2,29858
-torchTextClassifiers/classifiers/fasttext/tokenizer.py,sha256=d58c1ac0cbf7e62d21f3277a5fcb77fe9c7e74551df600843ce82fab5ad5664b,11422
-torchTextClassifiers/classifiers/fasttext/wrapper.py,sha256=372903cb9313f8f79791ea4664226c10cffc4d2ec41f657153645de6339cbbfb,8816
-torchTextClassifiers/classifiers/simple_text_classifier.py,sha256=d81afd256d451de212646bc99f8d8f790fb9e144c8fd93f44085acaed8c68be3,6725
-torchTextClassifiers/factories.py,sha256=608d545d55be38ecbd89e80ff655140e4d7b3ae1696d6c1d3812fea2dddde88d,1296
-torchTextClassifiers/torchTextClassifiers.py,sha256=fca4f7ca881d9d76711892c38ac6548f38d8376ad05878fabfbe9b08ca49090d,20496
-torchTextClassifiers/utilities/__init__.py,sha256=17df83700c131f2f4b5acc619ccafa0dcb55139f2a27cf00f6c682880a2b3746,21
-torchTextClassifiers/utilities/checkers.py,sha256=53494be4b95691090f70fda5498cc11f05adac042617d5da114ea60ea3e35444,3733
-torchTextClassifiers/utilities/preprocess.py,sha256=bba939a19a82e5ebc49509f2c8c5716b71975d502babbe89b236470655295390,2230
-torchTextClassifiers/utilities/utils.py,sha256=81ff0aeee829c0729d9eb1b37d7bc6e37d4bec0e65dbd199482e8da9584663ac,13567
-torchtextclassifiers-0.0.1.dist-info/WHEEL,sha256=b70116f4076fa664af162441d2ba3754dbb4ec63e09d563bdc1e9ab023cce400,78
-torchtextclassifiers-0.0.1.dist-info/METADATA,sha256=48862621e58dace60467867aef55399bda366a33ccd0c1bc7080a7ac60d05a39,4990
-torchtextclassifiers-0.0.1.dist-info/RECORD,,
```
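For context, each RECORD entry is a `path,hash,size` triple describing one file in the wheel. Below is a rough sketch of how such a line can be produced for a single file; note that standard wheel tooling records the digest as `sha256=` followed by an unpadded urlsafe base64 string, whereas the listing above shows hex digests, so treat the exact hash formatting as illustrative.

```python
# Illustrative only: build a RECORD-style "path,hash,size" line for one file,
# mirroring the hex-digest form shown above (standard wheel tooling normally
# uses an unpadded urlsafe-base64 sha256 digest instead).
import hashlib
from pathlib import Path


def record_line(path: str) -> str:
    data = Path(path).read_bytes()
    digest = hashlib.sha256(data).hexdigest()
    return f"{path},sha256={digest},{len(data)}"


# Hypothetical usage; assumes the file exists in the current working directory.
print(record_line("torchTextClassifiers/__init__.py"))
```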