PyPI - renard-pipeline - Versions diffs - 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

renard-pipeline 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of renard-pipeline might be problematic. Click here for more details.

Files changed (23)

renard/graph_utils.py +11 -4
renard/ner_utils.py +24 -14
renard/pipeline/character_unification.py +62 -19
renard/pipeline/characters_extraction.py +3 -1
renard/pipeline/core.py +141 -26
renard/pipeline/corefs/corefs.py +32 -33
renard/pipeline/graph_extraction.py +281 -192
renard/pipeline/ner/__init__.py +1 -0
renard/pipeline/{ner.py → ner/ner.py} +47 -76
renard/pipeline/ner/retrieval.py +375 -0
renard/pipeline/progress.py +32 -1
renard/pipeline/speaker_attribution.py +2 -3
renard/pipeline/tokenization.py +59 -30
renard/plot_utils.py +48 -28
renard/resources/determiners/__init__.py +1 -0
renard/resources/determiners/determiners.py +41 -0
renard/resources/hypocorisms/hypocorisms.py +3 -2
renard/utils.py +57 -1
{renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/METADATA +45 -20
renard_pipeline-0.6.0.dist-info/RECORD +39 -0
renard_pipeline-0.4.2.dist-info/RECORD +0 -35
{renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/LICENSE +0 -0
{renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/WHEEL +0 -0

renard/pipeline/ner/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from renard.pipeline.ner.ner import *

renard/pipeline/{ner.py → ner/ner.py} RENAMED Viewed

@@ -1,21 +1,32 @@
 from __future__ import annotations
-import random, itertools
-from typing import TYPE_CHECKING, List, Dict, Any, Set, Tuple, Optional, Union, Literal
+from typing import (
+    TYPE_CHECKING,
+    List,
+    Dict,
+    Any,
+    Set,
+    Tuple,
+    Optional,
+    Union,
+    Literal,
+)
 from dataclasses import dataclass
 import torch
-from seqeval.metrics import precision_score, recall_score, f1_score
 from renard.nltk_utils import nltk_fix_bio_tags
 from renard.ner_utils import (
     DataCollatorForTokenClassificationWithBatchEncoding,
     NERDataset,
 )
 from renard.pipeline.core import PipelineStep, Mention
-from renard.pipeline.progress import ProgressReporter
 from renard.ner_utils import ner_entities
 if TYPE_CHECKING:
     from transformers.tokenization_utils_base import BatchEncoding
-    from transformers import PreTrainedModel, PreTrainedTokenizerFast
+    from transformers import (
+        PreTrainedModel,
+        PreTrainedTokenizerFast,
+    )
+    from renard.pipeline.ner.retrieval import NERContextRetriever
 @dataclass
@@ -27,7 +38,7 @@ class NEREntity(Mention):
         """
         .. note::
-            This method is implemtented here to avoid type issues.  Since
+            This method is implemented here to avoid type issues.  Since
             :meth:`.Mention.shifted` cannot be annotated as returning
             ``Self``, this method annotate the correct return type when
             using :meth:`.NEREntity.shifted`.
@@ -41,18 +52,21 @@ class NEREntity(Mention):
 def score_ner(
     pred_bio_tags: List[str], ref_bio_tags: List[str]
 ) -> Tuple[float, float, float]:
-    """Score NER as in CoNLL-2003 shared task using ``seqeval``
+    """Score NER as in CoNLL-2003 shared task using the ``seqeval``
+    library, if installed.
     Precision is the percentage of named entities in ``ref_bio_tags``
-    that are correct. Recall is the percentage of named entities in
-    pred_bio_tags that are in ref_bio_tags. F1 is the harmonic mean of
-    both.
+    that are correct.  Recall is the percentage of named entities in
+    pred_bio_tags that are in ref_bio_tags.  F1 is the harmonic mean
+    of both.
     :param pred_bio_tags:
     :param ref_bio_tags:
-    :return: ``(precision, recall, F1 score)``
+    :return: ``(precision, recall, F1 score)``
     """
+    from seqeval.metrics import precision_score, recall_score, f1_score
     assert len(pred_bio_tags) == len(ref_bio_tags)
     return (
         precision_score([ref_bio_tags], [pred_bio_tags]),
@@ -70,12 +84,19 @@ class NLTKNamedEntityRecognizer(PipelineStep):
         """
         import nltk
-        nltk.download("averaged_perceptron_tagger", quiet=True)
+        nltk.download(f"averaged_perceptron_tagger", quiet=True)
         nltk.download("maxent_ne_chunker", quiet=True)
+        nltk.download("maxent_ne_chunker_tab", quiet=True)
         nltk.download("words", quiet=True)
         super().__init__()
+    def _pipeline_init_(self, lang: str, **kwargs):
+        import nltk
+        nltk.download(f"averaged_perceptron_tagger_{lang}", quiet=True)
+        super()._pipeline_init_(lang, **kwargs)
     def __call__(self, tokens: List[str], **kwargs) -> Dict[str, Any]:
         """
         :param text:
@@ -101,64 +122,6 @@ class NLTKNamedEntityRecognizer(PipelineStep):
         return {"entities"}
-class NERContextRetriever:
-    def __call__(self, dataset: NERDataset) -> NERDataset:
-        raise NotImplementedError
-class NERSamenounContextRetriever(NERContextRetriever):
-    """
-    Retrieve relevant context using the samenoun strategy as in
-    Amalvy et al.  2023.
-    """
-    def __init__(self, k: int) -> None:
-        """
-        :param k: the number of sentences to retrieve
-        """
-        self.k = k
-    def __call__(self, dataset: NERDataset) -> NERDataset:
-        import nltk
-        # NOTE: POS tagging is not incorporated in the pipeline yet,
-        # so we manually compute it here.
-        elements_names = [
-            {t[0] for t in nltk.pos_tag(element) if t[1].startswith("NN")}
-            for element in dataset.elements
-        ]
-        elements_with_context = []
-        for elt_i, elt in enumerate(dataset.elements):
-            retrieved_elts = [
-                other_elt
-                for other_elt_i, other_elt in enumerate(dataset.elements)
-                if not other_elt_i == elt_i
-                and len(elements_names[elt_i].intersection(elements_names[other_elt_i]))
-                > 0
-            ]
-            retrieved_elts = random.sample(
-                retrieved_elts, k=min(self.k, len(retrieved_elts))
-            )
-            elements_with_context.append(
-                (
-                    elt,
-                    [dataset.tokenizer.sep_token]
-                    + list(itertools.chain.from_iterable(retrieved_elts)),
-                )
-            )
-        return NERDataset(
-            [element + context for element, context in elements_with_context],
-            dataset.tokenizer,
-            [
-                [0] * len(element) + [1] * len(context)
-                for element, context in elements_with_context
-            ],
-        )
 class BertNamedEntityRecognizer(PipelineStep):
     """An entity recognizer based on BERT"""
@@ -214,10 +177,10 @@ class BertNamedEntityRecognizer(PipelineStep):
         super().__init__()
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+    def _pipeline_init_(self, lang: str, **kwargs):
         from transformers import AutoModelForTokenClassification, AutoTokenizer  # type: ignore
-        super()._pipeline_init_(lang, progress_reporter)
+        super()._pipeline_init_(lang, **kwargs)
         # init model if needed (this happens if the user did not pass
         # the instance of a model)
@@ -306,7 +269,7 @@ class BertNamedEntityRecognizer(PipelineStep):
         batch_i: int,
         wp_labels: List[str],
         tokens: List[str],
-        context_mask: torch.Tensor,
+        ctxmask: torch.Tensor,
     ) -> List[str]:
         """Align labels to tokens rather than wordpiece tokens.
@@ -317,13 +280,21 @@ class BertNamedEntityRecognizer(PipelineStep):
         """
         batch_labels = ["O"] * len(tokens)
+        try:
+            inference_start = ctxmask[batch_i].tolist().index(1)
+        except ValueError:
+            inference_start = 0
         for wplabel_j, wp_label in enumerate(wp_labels):
-            if context_mask[batch_i][wplabel_j] == 1:
-                continue
             token_i = batchs.token_to_word(batch_i, wplabel_j)
             if token_i is None:
                 continue
-            batch_labels[token_i] = wp_label
+            if ctxmask[batch_i][token_i] == 0:
+                continue
+            batch_labels[token_i - inference_start] = wp_label
         return batch_labels

renard/pipeline/ner/retrieval.py ADDED Viewed

@@ -0,0 +1,375 @@
+from collections.abc import Set
+import sys
+from typing import Union, List, cast, Literal, Optional
+import random
+from dataclasses import dataclass
+from more_itertools import flatten
+from renard.ner_utils import NERDataset
+import nltk
+from rank_bm25 import BM25Okapi
+from transformers import (
+    BertForSequenceClassification,
+    BertTokenizerFast,
+    DataCollatorWithPadding,
+)
+from transformers.tokenization_utils_base import BatchEncoding
+import torch
+from torch.utils.data import Dataset, DataLoader
+@dataclass
+class NERContextRetrievalMatch:
+    element: List[str]
+    element_i: int
+    side: Literal["left", "right"]
+    score: Optional[float]
+    def __hash__(self) -> int:
+        return hash(tuple(self.element) + (self.element_i, self.side, self.score))
+class NERContextRetriever:
+    def __init__(self, k: int) -> None:
+        self.k = k
+    def compute_global_features(self, elements: List[List[str]]) -> dict:
+        return {}
+    def retrieve(
+        self, element_i: int, elements: List[List[str]], **kwargs
+    ) -> List[NERContextRetrievalMatch]:
+        raise NotImplementedError
+    def __call__(self, dataset: NERDataset) -> NERDataset:
+        # [(left_ctx, element, right_ctx), ...]
+        elements_with_context = []
+        global_features = self.compute_global_features(dataset.elements)
+        for elt_i, elt in enumerate(dataset.elements):
+            matchs = self.retrieve(elt_i, dataset.elements, **global_features)
+            assert len(matchs) <= self.k
+            lctx = sorted(
+                (m for m in matchs if m.side == "left"),
+                key=lambda m: m.element_i,
+            )
+            lctx = list(flatten([m.element for m in lctx]))
+            rctx = sorted(
+                (m for m in matchs if m.side == "right"),
+                key=lambda m: m.element_i,
+            )
+            rctx = list(flatten([m.element for m in rctx]))
+            elements_with_context.append((lctx, elt, rctx))
+        return NERDataset(
+            [lctx + element + rctx for lctx, element, rctx in elements_with_context],
+            dataset.tokenizer,
+            [
+                [1] * len(lctx) + [0] * len(element) + [1] * len(rctx)
+                for lctx, element, rctx in elements_with_context
+            ],
+        )
+class NERSamenounContextRetriever(NERContextRetriever):
+    """
+    Retrieve relevant context using the samenoun strategy as in
+    Amalvy et al.  2023.
+    """
+    def __init__(self, k: int) -> None:
+        """
+        :param k: the max number of sentences to retrieve
+        """
+        super().__init__(k)
+    def compute_global_features(self, elements: List[List[str]]) -> dict:
+        return {
+            "NNs": [
+                {t[0] for t in nltk.pos_tag(element) if t[1] == "NN"}
+                for element in elements
+            ]
+        }
+    def retrieve(
+        self, element_i: int, elements: List[List[str]], NNs: List[Set[str]], **kwargs
+    ) -> List[NERContextRetrievalMatch]:
+        matchs = [
+            NERContextRetrievalMatch(
+                other_elt,
+                other_elt_i,
+                "left" if other_elt_i < element_i else "right",
+                None,
+            )
+            for other_elt_i, other_elt in enumerate(elements)
+            if not other_elt_i == element_i
+            and len(NNs[element_i].intersection(NNs[other_elt_i])) > 0  # type: ignore
+        ]
+        return random.sample(matchs, k=min(self.k, len(matchs)))
+class NERNeighborsContextRetriever(NERContextRetriever):
+    """A context retriever that chooses nearby elements."""
+    def __init__(self, k: int):
+        assert k % 2 == 0
+        super().__init__(k)
+    def retrieve(
+        self, element_i: int, elements: List[List[str]], **kwargs
+    ) -> List[NERContextRetrievalMatch]:
+        left_nb = self.k // 2
+        right_nb = left_nb
+        lctx = []
+        for i, elt in enumerate(elements[element_i - left_nb : element_i]):
+            lctx.append(
+                NERContextRetrievalMatch(elt, element_i - left_nb + i, "left", None)
+            )
+        rctx = []
+        for i, elt in enumerate(elements[element_i + 1 : element_i + 1 + right_nb]):
+            rctx.append(NERContextRetrievalMatch(elt, element_i + 1 + i, "right", None))
+        return lctx + rctx
+class NERBM25ContextRetriever(NERContextRetriever):
+    """A context retriever that selects elements according to the BM25 ranking formula."""
+    def __init__(self, k: int) -> None:
+        super().__init__(k)
+    def compute_global_features(self, elements: List[List[str]]) -> dict:
+        return {"bm25_model": BM25Okapi(elements)}
+    def retrieve(
+        self, element_i: int, elements: List[List[str]], bm25_model: BM25Okapi, **kwargs
+    ) -> List[NERContextRetrievalMatch]:
+        query = elements[element_i]
+        sent_scores = bm25_model.get_scores(query)
+        sent_scores[element_i] = float("-Inf")  # don't retrieve self
+        topk_values, topk_indexs = torch.topk(
+            torch.tensor(sent_scores), k=min(self.k, len(sent_scores)), dim=0
+        )
+        return [
+            NERContextRetrievalMatch(
+                elements[index], index, "left" if index < element_i else "right", value
+            )
+            for value, index in zip(topk_values.tolist(), topk_indexs.tolist())
+        ]
+@dataclass(frozen=True)
+class NERNeuralContextRetrievalExample:
+    """A context retrieval example."""
+    #: text on which NER is performed
+    element: List[str]
+    #: context to assist during prediction
+    context: List[str]
+    #: context side (does the context comes from the left or the right of ``sent`` ?)
+    context_side: Literal["left", "right"]
+class NERNeuralContextRetrievalDataset(Dataset):
+    """"""
+    def __init__(
+        self,
+        examples: List[NERNeuralContextRetrievalExample],
+        tokenizer: BertTokenizerFast,
+    ) -> None:
+        self.examples = examples
+        self.tokenizer: BertTokenizerFast = tokenizer
+    def __len__(self) -> int:
+        return len(self.examples)
+    def __getitem__(self, index: int) -> BatchEncoding:
+        """Get a BatchEncoding representing example at index.
+        :param index: index of the example to retrieve
+        :return: a ``BatchEncoding``, with key ``'label'`` set.
+        """
+        example = self.examples[index]
+        tokens = example.context + ["[SEP]"] + example.element
+        batch: BatchEncoding = self.tokenizer(
+            tokens,
+            is_split_into_words=True,
+            truncation=True,
+            max_length=512,
+        )
+        return batch
+class NERNeuralContextRetriever(NERContextRetriever):
+    """
+    A neural context retriever as in Amalvy et al.  2024
+    """
+    def __init__(
+        self,
+        heuristic_context_selector: NERContextRetriever,
+        pretrained_model: Union[
+            str, BertForSequenceClassification
+        ] = "compnet-renard/bert-base-cased-NER-reranker",
+        k: int = 3,
+        batch_size: int = 1,
+        threshold: float = 0.0,
+        device_str: Literal["cuda", "cpu", "auto"] = "auto",
+    ) -> None:
+        """
+        :param pretrained_model: pretrained model name, used to
+            load a :class:`transformers.BertForSequenceClassification`
+        :param heuristic_context_selector: name of the context
+            selector to use as retrieval heuristic, from
+            ``context_selector_name_to_class``
+        :param heuristic_context_selector_kwargs: kwargs to pass the
+            heuristic context retriever at instantiation time
+        :param k: max number of sents to retrieve
+        :param batch_size: batch size used at inference
+        :param threshold:
+        :param device_str:
+        """
+        from transformers import BertForSequenceClassification, BertTokenizerFast
+        if isinstance(pretrained_model, str):
+            self.ctx_classifier = BertForSequenceClassification.from_pretrained(
+                pretrained_model
+            )  # type: ignore
+        else:
+            self.ctx_classifier = pretrained_model
+        self.ctx_classifier = cast(BertForSequenceClassification, self.ctx_classifier)
+        self.tokenizer = BertTokenizerFast.from_pretrained(
+            pretrained_model if isinstance(pretrained_model, str) else "bert-base-cased"
+        )
+        self.heuristic_context_selector = heuristic_context_selector
+        self.batch_size = batch_size
+        self.threshold = threshold
+        if device_str == "auto":
+            device_str = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = torch.device(device_str)
+        super().__init__(k)
+    def set_heuristic_k_(self, k: int):
+        self.heuristic_context_selector.k = k
+    def predict(self, examples: List[NERNeuralContextRetrievalExample]) -> torch.Tensor:
+        """
+        :param dataset: A list of :class:`ContextSelectionExample`
+        :return: A tensor of shape ``(len(dataset), 2)`` of class
+                 scores
+        """
+        dataset = NERNeuralContextRetrievalDataset(examples, self.tokenizer)
+        self.ctx_classifier = self.ctx_classifier.to(self.device)
+        data_collator = DataCollatorWithPadding(dataset.tokenizer)  # type: ignore
+        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False, collate_fn=data_collator)  # type: ignore
+        # inference using self.ctx_classifier
+        self.ctx_classifier = self.ctx_classifier.eval()
+        with torch.no_grad():
+            scores = torch.zeros((0,)).to(self.device)
+            for X in dataloader:
+                X = X.to(self.device)
+                # out.logits is of shape (batch_size, 2)
+                out = self.ctx_classifier(
+                    X["input_ids"],
+                    token_type_ids=X["token_type_ids"],
+                    attention_mask=X["attention_mask"],
+                )
+                # (batch_size, 2)
+                pred = torch.softmax(out.logits, dim=1)
+                scores = torch.cat([scores, pred], dim=0)
+        return scores
+    def compute_global_features(self, elements: List[List[str]]) -> dict:
+        features = self.heuristic_context_selector.compute_global_features(elements)
+        return {
+            "heuristic_matchs": [
+                self.heuristic_context_selector.retrieve(i, elements, **features)
+                for i in range(len(elements))
+            ]
+        }
+    def retrieve(
+        self,
+        element_i: int,
+        elements: List[List[str]],
+        heuristic_matchs: List[List[NERContextRetrievalMatch]],
+        **kwargs,
+    ) -> List[NERContextRetrievalMatch]:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.ctx_classifier = self.ctx_classifier.to(device)  # type: ignore
+        # no context retrieved by heuristic : nothing to do
+        if len(heuristic_matchs) == 0:
+            return []
+        element = elements[element_i]
+        matchs = heuristic_matchs[element_i]
+        # prepare datas for inference
+        ctx_dataset = [
+            NERNeuralContextRetrievalExample(element, m.element, m.side) for m in matchs
+        ]
+        # (len(dataset), 2)
+        scores = self.predict(ctx_dataset)
+        for i, m in enumerate(matchs):
+            m.score = float(scores[i, 1].item())
+        assert all([not m.score is None for m in matchs])
+        return [
+            m
+            for m in sorted(matchs, key=lambda m: -m.score)[: self.k]  # type: ignore
+            if m.score > self.threshold  # type: ignore
+        ]
+class NEREnsembleContextRetriever(NERContextRetriever):
+    """Combine several context retriever"""
+    def __init__(self, retrievers: List[NERContextRetriever], k: int) -> None:
+        self.retrievers = retrievers
+        super().__init__(k)
+    def compute_global_features(self, elements: List[List[str]]) -> dict:
+        features = {}
+        for retriever in self.retrievers:
+            for k, v in retriever.compute_global_features(elements).items():
+                if k in features:
+                    print(
+                        f"[warning] NEREnsembleContextRetriver: incompatible global feature '{k}' between multiple retrievers.",
+                        file=sys.stderr,
+                    )
+                features[k] = v
+        return features
+    def retrieve(
+        self, element_i: int, elements: List[List[str]], **kwargs
+    ) -> List[NERContextRetrievalMatch]:
+        all_matchs = set()
+        for retriever in self.retrievers:
+            matchs = retriever.retrieve(element_i, elements, **kwargs)
+            all_matchs = all_matchs.union(matchs)
+        if all(not m.score is None for m in all_matchs):
+            return sorted(all_matchs, key=lambda m: -m.score)[: self.k]  # type: ignore
+        return random.choices(list(all_matchs), k=self.k)

renard/pipeline/progress.py CHANGED Viewed

@@ -1,4 +1,6 @@
+from __future__ import annotations
 from typing import Iterable, Literal, Optional, TypeVar, Generator
+import sys
 from tqdm import tqdm
@@ -20,6 +22,10 @@ class ProgressReporter:
         """Update reporter current message."""
         pass
+    def get_subreporter(self) -> ProgressReporter:
+        """Get the subreporter corresponding to that reporter."""
+        raise NotImplementedError
 class NoopProgressReporter(ProgressReporter):
     def reset_(self):
@@ -28,6 +34,28 @@ class NoopProgressReporter(ProgressReporter):
     def update_progress_(self, added_progress: int):
         pass
+    def get_subreporter(self) -> ProgressReporter:
+        return NoopProgressReporter()
+class TQDMSubProgressReporter(ProgressReporter):
+    def __init__(self, reporter: TQDMProgressReporter) -> None:
+        super().__init__()
+        self.reporter = reporter
+    def start_(self, total: int):
+        super().start_(total)
+        self.progress = 0
+    def update_progress_(self, added_progress: int):
+        self.progress += added_progress
+        self.reporter.tqdm.set_postfix(step=f"({self.progress}/{self.total})")
+    def update_message_(self, message: str):
+        self.reporter.tqdm.set_postfix(
+            step=f"({self.progress}/{self.total})", message=message
+        )
 class TQDMProgressReporter(ProgressReporter):
     def start_(self, total: int):
@@ -40,6 +68,9 @@ class TQDMProgressReporter(ProgressReporter):
     def update_message_(self, message: str):
         self.tqdm.set_description_str(message)
+    def get_subreporter(self) -> ProgressReporter:
+        return TQDMSubProgressReporter(self)
 T = TypeVar("T")
@@ -62,5 +93,5 @@ def get_progress_reporter(name: Optional[Literal["tqdm"]]) -> ProgressReporter:
         return NoopProgressReporter()
     if name == "tqdm":
         return TQDMProgressReporter()
-    print(f"[warning] unknown progress reporter: {name}")
+    print(f"[warning] unknown progress reporter: {name}", file=sys.stderr)
     return NoopProgressReporter()

renard/pipeline/speaker_attribution.py CHANGED Viewed

@@ -49,13 +49,12 @@ class BertSpeakerDetector(PipelineStep):
         super().__init__()
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+    def _pipeline_init_(self, lang: str, **kwargs):
         from transformers import AutoTokenizer
-        super()._pipeline_init_(lang, progress_reporter)
+        super()._pipeline_init_(lang, **kwargs)
         if self.model is None:
             # the user supplied a huggingface ID: load model from the HUB
             if not self.huggingface_model_id is None:
                 self.model = SpeakerAttributionModel.from_pretrained(

renard-pipeline 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

Potentially problematic release.

renard-pipeline 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl