wolof-translate 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1 @@
+ # from wolof_translate.utils.tokenize_text import tokenization
wolof_translate/utils/bucket_iterator.py
@@ -0,0 +1,143 @@
+ import torch
+ import numpy as np
+ from typing import *
+ from torch.utils.data import Sampler
+ from torch.nn.utils.rnn import pad_sequence
+ from math import ceil
+
+
+ class SequenceLengthBatchSampler(Sampler):
+     def __init__(self, dataset, boundaries, batch_sizes, input_key=None, label_key=None, drop_unique=True):
+         self.dataset = dataset
+         self.boundaries = boundaries
+         self.batch_sizes = batch_sizes
+         self.data_info = {}
+         self.drop_unique = drop_unique
+
+         # Initialize a dictionary with indices and element lengths
+         for i in range(len(dataset)):
+             data = dataset[i]
+             length = (
+                 max(len(data[0]), len(data[2]))
+                 if (input_key is None and label_key is None)
+                 else max(len(data[input_key]), len(data[label_key]))
+             )
+             self.data_info[i] = {"index": i, "length": length}
+
+         self.calculate_length()
+
+     def calculate_length(self):
+         self.batches = []
+
+         # Sort indices based on element length
+         sorted_indices = sorted(self.data_info.keys(), key=lambda i: self.data_info[i]["length"])
+
+         # Group indices into buckets of sequences with similar lengths
+         for boundary in self.boundaries:
+             batch = [i for i in sorted_indices if self.data_info[i]["length"] <= boundary]  # Filter indices based on the length boundary
+             self.batches.append(batch)
+             sorted_indices = [i for i in sorted_indices if i not in batch]  # Remove processed indices
+
+         # Add remaining indices to the last bucket
+         self.batches.append(sorted_indices)
+
+         # Total number of batches: full batches per bucket, plus any remainder that is
+         # kept (a remainder of a single element is dropped when drop_unique is set)
+         self.length = sum(
+             len(batch) // batch_size
+             + (
+                 1
+                 if len(batch) % batch_size > 0
+                 and (len(batch) % batch_size != 1 or not self.drop_unique)
+                 else 0
+             )
+             for batch, batch_size in zip(self.batches, self.batch_sizes)
+         )
+
+     def __iter__(self):
+         # Yield batches with the corresponding batch sizes
+         for batch_indices, batch_size in zip(self.batches, self.batch_sizes):
+             num_batches = len(batch_indices) // batch_size
+
+             for i in range(num_batches):
+                 # Retrieve the current bucket
+                 current_bucket = batch_indices[i * batch_size: (i + 1) * batch_size]
+
+                 # Shuffle the current bucket
+                 np.random.shuffle(current_bucket)
+
+                 # Yield the current bucket
+                 yield [self.data_info[j]["index"] for j in current_bucket]
+
+             remaining_indices = len(batch_indices) % batch_size
+
+             if remaining_indices > 0 and (remaining_indices != 1 or not self.drop_unique):
+
+                 # Retrieve the remaining indices
+                 current_bucket = batch_indices[-remaining_indices:]
+
+                 # Shuffle the remaining indices
+                 np.random.shuffle(current_bucket)
+
+                 # Yield the remaining indices
+                 yield [self.data_info[j]["index"] for j in current_bucket]
+
+     def __len__(self):
+         return self.length
+
+
+ class BucketSampler(Sampler):
+     def __init__(self, dataset, batch_size, sort_key=lambda x, index_1, index_2: max(len(x[index_1]), len(x[index_2])), input_key: Union[str, int] = 0, label_key: Union[str, int] = 1):
+         self.dataset = dataset
+         self.batch_size = batch_size
+         self.sort_key = sort_key
+         self.index_1 = input_key
+         self.index_2 = label_key
+         indices = np.argsort([self.sort_key(self.dataset[i], self.index_1, self.index_2) for i in range(len(self.dataset))])
+         self.batches = [indices[i:i + self.batch_size] for i in range(0, len(indices), self.batch_size)]
+
+     def __iter__(self):
+         if self.batch_size > 1:
+             np.random.shuffle(self.batches)
+         for batch in self.batches:
+             yield batch.tolist()
+
+     def __len__(self):
+         return ceil(len(self.dataset) / self.batch_size)
+
+
+ def collate_fn(batch):
+     # Separate the input sequences, input masks, target sequences, and target masks
+     input_seqs, input_masks, target_seqs, target_masks = zip(*batch)
+
+     # Pad the input sequences to the same length
+     padded_input_seqs = pad_sequence(input_seqs, batch_first=True)
+
+     # Pad the target sequences to the same length
+     padded_target_seqs = pad_sequence(target_seqs, batch_first=True)
+
+     # Pad the input masks to the same length
+     padded_input_masks = pad_sequence(input_masks, batch_first=True)
+
+     # Pad the target masks to the same length
+     padded_target_masks = pad_sequence(target_masks, batch_first=True)
+
+     return padded_input_seqs, padded_input_masks, padded_target_seqs, padded_target_masks
+
+
+ def collate_fn_trunc(batch, max_len, eos_token_id, pad_token_id):
+     # Separate the input sequences, input masks, target sequences, and target masks
+     input_seqs, input_masks, target_seqs, target_masks = zip(*batch)
+
+     # Pad the input sequences to the same length and truncate them to max_len
+     padded_input_seqs = pad_sequence(input_seqs, batch_first=True)[:, :max_len]
+
+     # Pad the target sequences to the same length and truncate them to max_len
+     padded_target_seqs = pad_sequence(target_seqs, batch_first=True)[:, :max_len]
+
+     # Restore the EOS token id at the last position when truncation removed it
+     # (i.e. the last position holds neither the EOS token nor a padding token)
+     padded_input_seqs[:, -1:][(padded_input_seqs[:, -1:] != eos_token_id) & (padded_input_seqs[:, -1:] != pad_token_id)] = eos_token_id
+
+     padded_target_seqs[:, -1:][(padded_target_seqs[:, -1:] != eos_token_id) & (padded_target_seqs[:, -1:] != pad_token_id)] = eos_token_id
+
+     # Pad the input masks to the same length and truncate them to max_len
+     padded_input_masks = pad_sequence(input_masks, batch_first=True)[:, :max_len]
+
+     # Pad the target masks to the same length and truncate them to max_len
+     padded_target_masks = pad_sequence(target_masks, batch_first=True)[:, :max_len]
+
+     return padded_input_seqs, padded_input_masks, padded_target_seqs, padded_target_masks
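The samplers above yield lists of dataset indices, so they plug into a PyTorch DataLoader through the batch_sampler argument, with collate_fn handling the padding. Below is a minimal usage sketch (not part of the package); the toy dataset, boundaries, and batch sizes are illustrative assumptions. BucketSampler is used the same way, with a single batch_size.

import torch
from torch.utils.data import DataLoader

# toy dataset: each item is (input_ids, input_mask, target_ids, target_mask)
pairs = [
    (torch.ones(n, dtype=torch.long), torch.ones(n, dtype=torch.long),
     torch.ones(n + 2, dtype=torch.long), torch.ones(n + 2, dtype=torch.long))
    for n in range(5, 40)
]

# bucket the pairs by length (<= 16 tokens, <= 32 tokens, and the rest),
# with one batch size per bucket
sampler = SequenceLengthBatchSampler(pairs, boundaries=[16, 32], batch_sizes=[8, 4, 2])

loader = DataLoader(pairs, batch_sampler=sampler, collate_fn=collate_fn)

for inputs, input_masks, targets, target_masks in loader:
    # sequences inside a batch have similar lengths, so little padding is wasted
    pass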
wolof_translate/utils/database_manager.py
@@ -0,0 +1,116 @@
+ from pymongo.mongo_client import MongoClient
+ from pymongo.server_api import ServerApi
+ import pandas as pd
+
+
+ class TranslationMongoDBManager:
+     def __init__(self, uri: str, database: str):
+
+         # get the client
+         self.client = MongoClient(uri)
+
+         # get the database
+         self.db = self.client.get_database(database)
+
+     def insert_documents(self, documents: list, collection: str = "sentences"):
+
+         # insert several documents into a collection
+         results = self.db[collection].insert_many(documents)
+
+         return results
+
+     def insert_document(self, document: dict, collection: str = "sentences"):
+
+         assert "_id" not in document
+
+         # get the id of the last sentence (retrieve the max id and add 1 to it)
+         max_id = self.get_max_id(collection)
+
+         # add the new sentence
+         document["_id"] = max_id + 1
+
+         results = self.db[collection].insert_one(document)
+
+         return results
+
+     def update_document(
+         self,
+         id: int,
+         document: dict,
+         collection: str = "sentences",
+         update_collection: str = "updated",
+     ):
+
+         # retrieve the document to update
+         upd_sent = self.db[collection].find_one({"_id": {"$eq": id}})
+
+         # apply the update to the document
+         self.db[collection].update_one(
+             {"_id": {"$eq": upd_sent["_id"]}}, {"$set": document}
+         )
+
+         # archive the previous version in the updated-sentences collection
+         upd_sent["_id"] = len(list(self.db[update_collection].find()))
+
+         results = self.db[update_collection].insert_one(upd_sent)
+
+         return results
+
+     def delete_document(
+         self, id: int, collection: str = "sentences", del_collection: str = "deleted"
+     ):
+
+         # retrieve the document to delete
+         del_sent = self.db[collection].find_one({"_id": {"$eq": id}})
+
+         # delete the sentence
+         self.db[collection].delete_one({"_id": {"$eq": del_sent["_id"]}})
+
+         # archive the sentence in the deleted-sentences collection
+         del_sent["_id"] = len(list(self.db[del_collection].find()))
+
+         results = self.db[del_collection].insert_one(del_sent)
+
+         return results
+
+     def get_max_id(self, collection: str = "sentences"):
+
+         # retrieve the maximum id
+         id = list(self.db[collection].find().sort("_id", -1).limit(1))[0]["_id"]
+
+         return id
+
+     def save_data_frames(
+         self,
+         sentences_path: str,
+         deleted_path: str,
+         collection: str = "sentences",
+         del_collection: str = "deleted",
+     ):
+
+         # retrieve the current corpora
+         new_corpora = pd.DataFrame(list(self.db[collection].find()))
+
+         # retrieve the deleted sentences as a DataFrame
+         deleted_df = pd.DataFrame(list(self.db[del_collection].find()))
+
+         # save the data frames as csv files (the "_id" column is dropped)
+         new_corpora.set_index("_id", inplace=True)
+
+         deleted_df.set_index("_id", inplace=True)
+
+         new_corpora.to_csv(sentences_path, index=False)
+
+         deleted_df.to_csv(deleted_path, index=False)
+
+     def load_data_frames(
+         self, collection: str = "sentences", del_collection: str = "deleted"
+     ):
+
+         # retrieve the current corpora
+         new_corpora = pd.DataFrame(list(self.db[collection].find()))
+
+         # retrieve the deleted sentences as a DataFrame
+         deleted_df = pd.DataFrame(list(self.db[del_collection].find()))
+
+         return new_corpora, deleted_df
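A hypothetical session with the manager is sketched below; the connection string, database name, and document fields are placeholders, not values shipped with the package.

manager = TranslationMongoDBManager(
    uri="mongodb+srv://user:password@cluster.example.mongodb.net",
    database="wolof_corpus",  # assumed database name
)

# append a new sentence pair; its "_id" becomes the current max id + 1
manager.insert_document({"french": "Merci beaucoup.", "wolof": "Jërëjëf."})

# correct an existing pair; the previous version is archived in the "updated" collection
manager.update_document(3, {"wolof": "Jërëjëf bu baax."})

# export the current corpus and the deleted sentences to CSV files
manager.save_data_frames("sentences.csv", "deleted.csv")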
wolof_translate/utils/display_predictions.py
@@ -0,0 +1,162 @@
+ import plotly.graph_objects as go
+ from tabulate import tabulate
+ import plotly.io as pio
+ import pandas as pd
+ import textwrap
+
+
+ def display_samples(
+     data_frame: pd.DataFrame,
+     n_samples: int = 40,
+     seed: int = 0,
+     header_color: str = "paleturquoise",
+     cells_color: str = "lavender",
+     width: int = 600,
+     height: int = 1000,
+     save_sample: bool = True,
+     table_caption: str = "",
+     label: str = "",
+     filename: str = "samples.csv",
+     lang: str = "eng",
+     show: bool = True,
+ ):
+     """Display a random sample of the data frame.
+
+     Args:
+         data_frame (pd.DataFrame): The data frame to display.
+         n_samples (int, optional): The number of samples. Defaults to 40.
+         seed (int, optional): The random generator's seed. Defaults to 0.
+         header_color (str, optional): The header color. Defaults to 'paleturquoise'.
+         cells_color (str, optional): The cells' color. Defaults to 'lavender'.
+         width (int): The width of the figure. Defaults to 600.
+         height (int): The height of the figure. Defaults to 1000.
+         save_sample (bool): Whether to save the sampled sentences as a CSV file. Defaults to True.
+         filename (str): The base name of the saved CSV file. Defaults to 'samples.csv'.
+         lang (str): The language of the column headers: 'fr' for French or 'eng' for English. Defaults to 'eng'.
+         show (bool): Whether to display the figure. Defaults to True.
+
+     Returns:
+         go.Figure: The figure.
+     """
+
+     # get the samples from the data frame
+     samples = data_frame.sample(n_samples, random_state=seed)
+
+     if lang == "fr":
+
+         samples.columns = ["Phrases Originales", "Phrases Cibles", "Prédictions"]
+
+     elif lang == "eng":
+
+         samples.columns = ["Source Sentences", "Target Sentences", "Predictions"]
+
+     # build the table figure
+     fig = go.Figure(
+         data=go.Table(
+             header=dict(
+                 values=list(samples.columns),
+                 fill_color=header_color,
+                 align="center",
+                 font=dict(size=14, color="black"),  # Header font style
+                 height=40,
+             ),
+             cells=dict(
+                 values=[samples[col] for col in samples.columns],
+                 fill_color=cells_color,
+                 line=dict(color="rgb(204, 204, 204)", width=1),
+                 font=dict(size=12, color="black"),
+                 height=30,
+                 align="left",
+             ),
+             columnwidth=[400, 400, 400],
+         )
+     )
+
+     # Customize the table layout
+     fig.update_layout(
+         width=width,  # Set the overall table width
+         height=height,  # Set the overall table height
+         margin=dict(l=0, r=0, t=0, b=0),  # Remove margins
+         paper_bgcolor="rgba(0,0,0,0)",  # Transparent background
+         plot_bgcolor="rgba(0,0,0,0)",  # Transparent plot area background
+     )
+
+     # display the figure
+     if show:
+         fig.show()
+
+     # save the sampled sentences as a CSV file
+     if save_sample:
+         samples.to_csv(f"{filename}_{lang}.csv", index=False, encoding="utf-16")
+
+     return fig
+
+
+ def save_go_figure_as_image(
+     fig, path: str, scale: int = 3, width: int = 600, height: int = 1000
+ ):
+
+     # save the figure as an image
+     pio.write_image(fig, path, format="png", scale=scale, width=width, height=height)
+
+
+ def escape_latex(text):
+     """
+     Escape special characters in text for LaTeX.
+     """
+     special_chars = ["_", "&", "%", "$", "#", "{", "}", "~", "^"]
+     for char in special_chars:
+         text = text.replace(char, "\\" + char)
+     return text
+
+
+ def wrap_text(text, max_width):
+     """
+     Wrap long text to fit into the table cells.
+     """
+     return "\n".join(textwrap.wrap(text, width=max_width))
+
+
+ def save_latex_table(
+     data_frame,
+     table_caption="",
+     label="",
+     filename="table.tex",
+     max_cell_width: int = 100,
+     wrap_long_text: bool = True,
+ ):
+     """
+     Convert a pandas DataFrame to a LaTeX table and save it to a file.
+
+     Parameters:
+         data_frame (pandas.DataFrame): The DataFrame to convert to a LaTeX table.
+         table_caption (str): Optional caption for the table.
+         label (str): Optional label for referencing the table in the document.
+         filename (str): The name of the file to save the LaTeX code.
+         max_cell_width (int): The maximum number of characters per line inside a cell.
+         wrap_long_text (bool): Whether to wrap long cell contents before the conversion.
+
+     Returns:
+         None
+     """
+     # Optionally wrap long cell contents so they fit into the table cells
+     if wrap_long_text:
+         data_frame = data_frame.applymap(lambda cell: wrap_text(str(cell), max_cell_width))
+
+     # Convert the DataFrame to a LaTeX tabular representation
+     latex_table = data_frame.to_latex(
+         index=False, escape=False, column_format=len(data_frame.columns) * r"p{.425\linewidth}"
+     )
+
+     # Wrap the tabular representation in a table environment with the caption and label
+     latex_table = (
+         "\\begin{table}\n"
+         "    \\centering\n"
+         "    \\small\n"
+         f"    \\caption{{{table_caption}}}\n\n"
+         f"{latex_table}"
+         f"    \\label{{{label}}}\n"
+         "\\end{table}"
+     )
+
+     with open(filename, "w", encoding="utf-8") as f:
+
+         f.write(latex_table)
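As a rough illustration, the helpers above could be chained as follows; the three-column frame and its contents are made up, and the column names are only placeholders for source, reference, and predicted sentences.

import pandas as pd

predictions = pd.DataFrame(
    {
        "source": ["Merci beaucoup.", "Bonjour."],
        "target": ["Jërëjëf.", "Salaam aleekum."],
        "prediction": ["Jërëjëf bu baax.", "Salaam aleekum."],
    }
)

# render the sample as a Plotly table and export it as a PNG image
fig = display_samples(predictions, n_samples=2, lang="eng", show=False, save_sample=False)
save_go_figure_as_image(fig, "samples.png")

# write the same sample as a LaTeX table
save_latex_table(predictions, table_caption="Sample predictions", label="tab:samples", filename="samples.tex")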
wolof_translate/utils/download_model.py
@@ -0,0 +1,40 @@
+ import shutil
+ import wandb
+ import glob
+ import os
+
+
+ def transfer_model(artifact_dir: str, model_name: str):
+     """Transfer a downloaded artifact to another directory.
+
+     Args:
+         artifact_dir (str): The directory of the artifact
+         model_name (str): The name of the model directory (e.g. data/checkpoints/<model_name>)
+     """
+     # copy the files inside the artifact to the model directory
+     os.makedirs(model_name, exist_ok=True)
+     for file in glob.glob(f"{artifact_dir}/*"):
+         shutil.copy(file, model_name)
+
+     # delete the artifact directory
+     shutil.rmtree(artifact_dir)
+
+
+ def download_artifact(artifact_name: str, model_name: str, type_: str = "dataset"):
+     """Download an artifact from Weights & Biases and store it in a directory.
+
+     Args:
+         artifact_name (str): The name of the artifact
+         model_name (str): The name of the model directory
+         type_ (str): The type of the artifact. Defaults to 'dataset'.
+     """
+     # download the wandb artifact
+     run = wandb.init()
+     artifact = run.use_artifact(artifact_name, type=type_)
+     artifact_dir = artifact.download()
+
+     # transfer the artifact into another directory
+     transfer_model(artifact_dir, model_name)
+
+     # finish the wandb run
+     wandb.finish()
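A hypothetical call is shown below; the artifact path and the target directory follow the data/checkpoints/<model_name> layout mentioned in the comments, but both names are assumptions rather than values defined by the package.

download_artifact(
    artifact_name="my-team/wolof-translate/transformer_checkpoints:v0",  # assumed artifact
    model_name="data/checkpoints/transformer_fw_v1",                     # assumed target directory
    type_="dataset",
)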
wolof_translate/utils/evaluate_custom.py
@@ -0,0 +1,147 @@
+ from tokenizers import Tokenizer
+ from typing import *
+ import numpy as np
+ import evaluate
+
+
+ class TranslationEvaluation:
+     def __init__(
+         self,
+         tokenizer: Tokenizer,
+         decoder: Union[Callable, None] = None,
+         next_gen: bool = False,
+     ):
+
+         self.tokenizer = tokenizer
+
+         self.decoder = decoder
+
+         self.bleu = evaluate.load("sacrebleu")
+
+         self.rouge = evaluate.load("rouge")
+
+         self.accuracy = evaluate.load("accuracy")
+
+         self.next_gen = next_gen
+
+     def postprocess_text(self, preds, labels):
+
+         for i in range(len(labels)):
+
+             pred = preds[i].strip()
+
+             label = labels[i].strip()
+
+             if self.next_gen:
+
+                 # keep only the suffixes after the first position where the
+                 # prediction and the reference diverge
+                 new_pred = ""
+
+                 new_label = ""
+
+                 for j in range(len(label)):
+
+                     if pred[:j] != label[:j]:
+
+                         new_pred = pred[j:]
+
+                         new_label = label[j:]
+
+                         break
+
+                 preds[i] = new_pred
+
+                 labels[i] = new_label
+
+             else:
+
+                 preds[i] = pred
+
+                 labels[i] = [label]
+
+         return preds, labels
+
+     def postprocess_codes(self, preds: np.ndarray, labels: np.ndarray):
+
+         # padding positions (id 0) receive a zero weight in the accuracy
+         label_weights = (labels != 0).astype(float).tolist()
+
+         preds = preds.tolist()
+
+         labels = labels.tolist()
+
+         return preds, labels, label_weights
+
+     def compute_metrics(
+         self, eval_preds, rouge: bool = True, bleu: bool = True, accuracy: bool = False
+     ):
+
+         preds, labels = eval_preds
+
+         if isinstance(preds, tuple):
+
+             preds = preds[0]
+
+         decoded_preds = (
+             self.tokenizer.batch_decode(preds, skip_special_tokens=True)
+             if not self.decoder
+             else self.decoder(preds)
+         )
+
+         decoded_labels = (
+             self.tokenizer.batch_decode(labels, skip_special_tokens=True)
+             if not self.decoder
+             else self.decoder(labels)
+         )
+
+         result = {}
+
+         if accuracy:
+
+             pred_codes, label_codes, sample_weight = self.postprocess_codes(
+                 preds, labels
+             )
+
+             # token-level accuracy, averaged over the sequences of the batch
+             accuracy_result = np.mean(
+                 [
+                     self.accuracy.compute(
+                         predictions=pred_codes[i],
+                         references=label_codes[i],
+                         sample_weight=sample_weight[i],
+                     )["accuracy"]
+                     for i in range(len(pred_codes))
+                 ]
+             )
+
+             result["accuracy"] = accuracy_result
+
+         if bleu or rouge:
+
+             decoded_preds, decoded_labels = self.postprocess_text(
+                 decoded_preds, decoded_labels
+             )
+
+         if bleu:
+
+             bleu_result = self.bleu.compute(
+                 predictions=decoded_preds, references=decoded_labels
+             )
+
+             result["bleu"] = bleu_result["score"]
+
+         if rouge:
+
+             rouge_result = self.rouge.compute(
+                 predictions=decoded_preds, references=decoded_labels
+             )
+
+             result.update(rouge_result)
+
+         # average length of the generated sequences (non-padding tokens)
+         prediction_lens = [
+             np.count_nonzero(np.array(pred) != self.tokenizer.pad_token_id)
+             for pred in preds
+         ]
+
+         result["gen_len"] = np.mean(prediction_lens)
+
+         result = {k: round(v, 4) for k, v in result.items()}
+
+         return result
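A minimal sketch of a direct call, assuming a Hugging Face tokenizer (the t5-small checkpoint and the sentence are illustrative only, not values used by the package):

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint
evaluator = TranslationEvaluation(tokenizer)

# pretend the model reproduced the reference exactly
ids = tokenizer("waaw, jërëjëf").input_ids
preds = np.array([ids])
labels = np.array([ids])

print(evaluator.compute_metrics((preds, labels), rouge=True, bleu=True))
# e.g. {'bleu': 100.0, 'rouge1': 1.0, ..., 'gen_len': ...}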
wolof_translate/utils/evaluation.py
@@ -0,0 +1,74 @@
+ from tokenizers import Tokenizer
+ from typing import *
+ import numpy as np
+ import evaluate
+
+
+ class TranslationEvaluation:
+     def __init__(
+         self,
+         tokenizer: Tokenizer,
+         decoder: Union[Callable, None] = None,
+         metric=None,
+     ):
+
+         self.tokenizer = tokenizer
+
+         self.decoder = decoder
+
+         # load sacrebleu lazily so that the metric is only fetched when needed
+         self.metric = metric if metric is not None else evaluate.load("sacrebleu")
+
+     def postprocess_text(self, preds, labels):
+
+         preds = [pred.strip() for pred in preds]
+
+         labels = [[label.strip()] for label in labels]
+
+         return preds, labels
+
+     def compute_metrics(self, eval_preds):
+
+         preds, labels = eval_preds
+
+         if isinstance(preds, tuple):
+
+             preds = preds[0]
+
+         decoded_preds = (
+             self.tokenizer.batch_decode(preds, skip_special_tokens=True)
+             if not self.decoder
+             else self.decoder(preds)
+         )
+
+         # replace the -100 ignore index by the pad token id before decoding
+         labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
+
+         decoded_labels = (
+             self.tokenizer.batch_decode(labels, skip_special_tokens=True)
+             if not self.decoder
+             else self.decoder(labels)
+         )
+
+         decoded_preds, decoded_labels = self.postprocess_text(
+             decoded_preds, decoded_labels
+         )
+
+         result = self.metric.compute(
+             predictions=decoded_preds, references=decoded_labels
+         )
+
+         result = {"bleu": result["score"]}
+
+         # average length of the generated sequences (non-padding tokens)
+         prediction_lens = [
+             np.count_nonzero(np.array(pred) != self.tokenizer.pad_token_id)
+             for pred in preds
+         ]
+
+         result["gen_len"] = np.mean(prediction_lens)
+
+         result = {k: round(v, 4) for k, v in result.items()}
+
+         return result
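This second variant differs from evaluate_custom.py mainly in that it maps the -100 ignore index used by Hugging Face label collators back to the pad token before decoding, which makes it usable as a compute_metrics callback for a Seq2SeqTrainer with predict_with_generate enabled. A small direct-call sketch, with an assumed t5-small tokenizer and made-up token ids:

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint
evaluation = TranslationEvaluation(tokenizer)

ids = tokenizer("jërëjëf").input_ids
preds = np.array([ids])
# -100 marks positions that the loss ignores; they are mapped to the pad token
labels = np.array([ids + [-100, -100]])

print(evaluation.compute_metrics((preds, labels)))  # {'bleu': ..., 'gen_len': ...}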