renard-pipeline 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- renard/ner_utils.py +304 -42
- renard/pipeline/character_unification.py +10 -11
- renard/pipeline/characters_extraction.py +1 -1
- renard/pipeline/core.py +51 -34
- renard/pipeline/graph_extraction.py +7 -10
- renard/pipeline/ner.py +79 -58
- renard/pipeline/stanford_corenlp.py +1 -1
- renard/py.typed +0 -0
- renard/utils.py +1 -52
- {renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/METADATA +42 -4
- {renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/RECORD +13 -12
- {renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/LICENSE +0 -0
- {renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/WHEEL +0 -0
renard/ner_utils.py
CHANGED
@@ -1,9 +1,26 @@
-from …
+from __future__ import annotations
+from typing import TYPE_CHECKING, List, Optional, Union, Dict, Tuple
+import os, re
+import itertools as it
+import functools as ft
+from more_itertools import flatten
 import torch
 from torch.utils.data import Dataset
-from …
+from datasets import Dataset as HGDataset
+from datasets import Sequence, ClassLabel
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    PreTrainedTokenizerFast,
+    PreTrainedModel,
+    Trainer,
+    TrainingArguments,
+)
 from transformers.tokenization_utils_base import BatchEncoding

+if TYPE_CHECKING:
+    from renard.pipeline.ner import NEREntity
+

 class DataCollatorForTokenClassificationWithBatchEncoding:
     """Same as ``transformers.DataCollatorForTokenClassification``,
@@ -20,61 +37,306 @@ class DataCollatorForTokenClassificationWithBatchEncoding:
     ) -> None:
         self.tokenizer = tokenizer
         self.pad_to_multiple_of = pad_to_multiple_of
-        self.…
-…
-    def __call__(self, features) -> Union[dict, BatchEncoding]:
-…
-…
-…
-…
-…
-        batch = …
-…
-…
-…
-…
-…
+        self.pad_token_id = {"label": -100, "labels": -100}
+
+    def __call__(self, features: List[dict]) -> Union[dict, BatchEncoding]:
+        keys = features[0].keys()
+        sequence_len = max([len(f["input_ids"]) for f in features])
+
+        # We do the padding and collating manually instead of calling
+        # self.tokenizer.pad, because pad does not work on arbitrary
+        # features.
+        batch = BatchEncoding({})
+        for key in keys:
+            if self.tokenizer.padding_side == "right":
+                batch[key] = [
+                    f[key]
+                    + [self.pad_token_id.get(key, 0)] * (sequence_len - len(f[key]))
+                    for f in features
+                ]
+            else:
+                batch[key] = [
+                    [
+                        self.pad_token_id.get(key, 0) * (sequence_len - len(f[key]))
+                        + f[key]
+                        for f in features
+                    ]
+                ]
+
         batch._encodings = [f.encodings[0] for f in features]

-…
-…
-…
-        sequence_length = torch.tensor(batch["input_ids"]).shape[1]
-        padding_side = self.tokenizer.padding_side
-        if padding_side == "right":
-            batch[label_name] = [
-                list(label) + [self.label_pad_token_id] * (sequence_length - len(label))
-                for label in labels
-            ]
-        else:
-            batch[label_name] = [
-                [self.label_pad_token_id] * (sequence_length - len(label)) + list(label)
-                for label in labels
-            ]
+        for k, v in batch.items():
+            batch[k] = torch.tensor(v)

         return batch


 class NERDataset(Dataset):
+    """
+    :ivar _context_mask: for each element, a mask indicating which
+        tokens are part of the context (1 for context, 0 for text on
+        which to perform inference). The mask allows to discard
+        predictions made for context at inference time, even though
+        the context can still be passed as input to the model.
+    """
+
     def __init__(
-        self,…
+        self,
+        elements: List[List[str]],
+        tokenizer: PreTrainedTokenizerFast,
+        context_mask: Optional[List[List[int]]] = None,
     ) -> None:
-        self.…
+        self.elements = elements
+
+        if context_mask:
+            assert all(
+                [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
+            )
+        self._context_mask = context_mask or [[0] * len(elt) for elt in self.elements]
+
         self.tokenizer = tokenizer

-    def __getitem__(self, index) -> BatchEncoding:
+    def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
+        element = self.elements[index]
+
         batch = self.tokenizer(
-…
-            return_tensors="pt",
-            padding=True,
+            element,
             truncation=True,
+            max_length=512,  # TODO
             is_split_into_words=True,
         )
-…
-…
+
+        batch["context_mask"] = [0] * len(batch["input_ids"])
+        elt_context_mask = self._context_mask[index]
+        for i in range(len(element)):
+            w2t = batch.word_to_tokens(0, i)
+            mask_value = elt_context_mask[i]
+            tokens_mask = [mask_value] * (w2t.end - w2t.start)
+            batch["context_mask"][w2t.start : w2t.end] = tokens_mask
+
         return batch

     def __len__(self) -> int:
-        return len(self.…
+        return len(self.elements)
+
+
+def ner_entities(
+    tokens: List[str], bio_tags: List[str], resolve_inconsistencies: bool = True
+) -> List[NEREntity]:
+    """Extract NER entities from a list of BIO tags
+
+    :param tokens: a list of tokens
+    :param bio_tags: a list of BIO tags. In particular, BIO tags
+        should be in the CoNLL-2002 form (such as 'B-PER I-PER')
+
+    :return: A list of ner entities, in apparition order
+    """
+    from renard.pipeline.ner import NEREntity
+
+    assert len(tokens) == len(bio_tags)
+
+    entities = []
+    current_tag: Optional[str] = None
+    current_tag_start_idx: Optional[int] = None
+
+    for i, tag in enumerate(bio_tags):
+        if not current_tag is None and not tag.startswith("I-"):
+            assert not current_tag_start_idx is None
+            entities.append(
+                NEREntity(
+                    tokens[current_tag_start_idx:i],
+                    current_tag_start_idx,
+                    i,
+                    current_tag,
+                )
+            )
+            current_tag = None
+            current_tag_start_idx = None
+
+        if tag.startswith("B-"):
+            current_tag = tag[2:]
+            current_tag_start_idx = i
+
+        elif tag.startswith("I-"):
+            if current_tag is None and resolve_inconsistencies:
+                current_tag = tag[2:]
+                current_tag_start_idx = i
+                continue
+
+    if not current_tag is None:
+        assert not current_tag_start_idx is None
+        entities.append(
+            NEREntity(
+                tokens[current_tag_start_idx : len(tokens)],
+                current_tag_start_idx,
+                len(bio_tags),
+                current_tag,
+            )
+        )
+
+    return entities
+
+
+def load_conll2002_bio(
+    path: str,
+    tag_conversion_map: Optional[Dict[str, str]] = None,
+    separator: str = "\t",
+    **kwargs,
+) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
+    """Load a file under CoNLL-2002 BIO format. Sentences are expected
+    to be separated by end of lines. Tags should be in the CoNLL-2002
+    format (such as 'B-PER I-PER') - If this is not the case, see the
+    ``tag_conversion_map`` argument.
+
+    :param path: path to the CoNLL-2002 formatted file
+    :param separator: separator between token and BIO tags
+    :param tag_conversion_map: conversion map for tags found in the
+        input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
+    :param kwargs: additional kwargs for ``open`` (such as
+        ``encoding`` or ``newline``).
+
+    :return: ``(sentences, tokens, entities)``
+    """
+    tag_conversion_map = tag_conversion_map or {}
+
+    with open(os.path.expanduser(path), **kwargs) as f:
+        raw_data = f.read()
+
+    sents = []
+    sent_tokens = []
+    tags = []
+    for line in raw_data.split("\n"):
+        line = line.strip("\n")
+        if re.fullmatch(r"\s*", line):
+            if len(sent_tokens) == 0:
+                continue
+            sents.append(sent_tokens)
+            sent_tokens = []
+            continue
+        token, tag = line.split(separator)
+        sent_tokens.append(token)
+        tags.append(tag_conversion_map.get(tag, tag))
+
+    tokens = list(flatten(sents))
+    entities = ner_entities(tokens, tags)
+
+    return sents, list(flatten(sents)), entities
+
+
+def hgdataset_from_conll2002(
+    path: str,
+    tag_conversion_map: Optional[Dict[str, str]] = None,
+    separator: str = "\t",
+    **kwargs,
+) -> HGDataset:
+    """Load a CoNLL-2002 file as a Huggingface Dataset.
+
+    :param path: passed to :func:`.load_conll2002_bio`
+    :param tag_conversion_map: passed to :func:`load_conll2002_bio`
+    :param separator: passed to :func:`load_conll2002_bio`
+    :param kwargs: passed to :func:`load_conll2002_bio`
+
+    :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
+    """
+    sentences, tokens, entities = load_conll2002_bio(
+        path, tag_conversion_map, separator, **kwargs
+    )
+
+    # convert entities to labels
+    tags = ["O"] * len(tokens)
+    for entity in entities:
+        entity_len = entity.end_idx - entity.start_idx
+        tags[entity.start_idx : entity.end_idx] = [f"B-{entity.tag}"] + [
+            f"I-{entity.tag}"
+        ] * (entity_len - 1)
+
+    # cut into sentences
+    sent_ends = list(it.accumulate([len(s) for s in sentences]))
+    sent_starts = [0] + sent_ends[:-1]
+    sent_tags = [
+        tags[sent_start:sent_end]
+        for sent_start, sent_end in zip(sent_starts, sent_ends)
+    ]
+
+    dataset = HGDataset.from_dict({"tokens": sentences, "labels": sent_tags})
+    dataset = dataset.cast_column(
+        "labels", Sequence(ClassLabel(names=sorted(set(tags))))
+    )
+    return dataset
+
+
+def _tokenize_and_align_labels(
+    examples, tokenizer: PreTrainedTokenizerFast, label_all_tokens: bool = True
+):
+    """Adapted from https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb#scrollTo=vc0BSBLIIrJQ
+
+    :param examples: an object with keys 'tokens' and 'labels'
+    """
+    tokenized_inputs = tokenizer(
+        examples["tokens"], truncation=True, is_split_into_words=True
+    )
+
+    labels = []
+    for i, label in enumerate(examples[f"labels"]):
+        word_ids = tokenized_inputs.word_ids(batch_index=i)
+        previous_word_idx = None
+        label_ids = []
+        for word_idx in word_ids:
+            # Special tokens have a word id that is None. We set the
+            # label to -100 so they are automatically ignored in the
+            # loss function.
+            if word_idx is None:
+                label_ids.append(-100)
+            # We set the label for the first token of each word.
+            elif word_idx != previous_word_idx:
+                label_ids.append(label[word_idx])
+            # For the other tokens in a word, we set the label to
+            # either the current label or -100, depending on the
+            # label_all_tokens flag.
+            else:
+                label_ids.append(label[word_idx] if label_all_tokens else -100)
+            previous_word_idx = word_idx
+
+        labels.append(label_ids)
+
+    tokenized_inputs["labels"] = labels
+
+    return tokenized_inputs
+
+
+def train_ner_model(
+    hg_id: str,
+    dataset: HGDataset,
+    targs: TrainingArguments,
+) -> PreTrainedModel:
+    from transformers import DataCollatorForTokenClassification
+
+    # BERT tokenizer splits tokens into subtokens. The
+    # tokenize_and_align_labels function correctly aligns labels and
+    # subtokens.
+    tokenizer = AutoTokenizer.from_pretrained(hg_id)
+    dataset = dataset.map(
+        ft.partial(_tokenize_and_align_labels, tokenizer=tokenizer), batched=True
+    )
+    dataset = dataset.train_test_split(test_size=0.1)
+
+    label_lst = dataset["train"].features["labels"].feature.names
+    model = AutoModelForTokenClassification.from_pretrained(
+        hg_id,
+        num_labels=len(label_lst),
+        id2label={i: label for i, label in enumerate(label_lst)},
+        label2id={label: i for i, label in enumerate(label_lst)},
+    )
+
+    trainer = Trainer(
+        model,
+        targs,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
+        # data_collator=DataCollatorForTokenClassificationWithBatchEncoding(tokenizer),
+        data_collator=DataCollatorForTokenClassification(tokenizer),
+        tokenizer=tokenizer,
+    )
+    trainer.train()
+
+    return model
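The new `ner_utils` module now owns the CoNLL-2002 helpers and adds dataset/training utilities. A minimal usage sketch of the functions added above, assuming a local CoNLL-2002 file at `./my_corpus.conll` (hypothetical path; the base model ID is just an example checkpoint):

```python
from transformers import TrainingArguments
from renard.ner_utils import (
    ner_entities,
    hgdataset_from_conll2002,
    train_ner_model,
)

# decode BIO tags into NEREntity objects
tokens = ["Emma", "Woodhouse", "lived", "at", "Hartfield"]
tags = ["B-PER", "I-PER", "O", "O", "B-LOC"]
entities = ner_entities(tokens, tags)  # two entities: "Emma Woodhouse" and "Hartfield"

# load a CoNLL-2002 file as a Huggingface dataset and fine-tune a model on it
dataset = hgdataset_from_conll2002("./my_corpus.conll", separator="\t")
model = train_ner_model(
    "bert-base-cased",  # any token-classification checkpoint on the HF hub
    dataset,
    TrainingArguments(output_dir="./checkpoints"),
)
```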
renard/pipeline/character_unification.py
CHANGED

@@ -54,8 +54,8 @@ def _assign_coreference_mentions(
     :param corefs:
     """

-    char_mentions: Dict[Character, …
-        character: character.mentions for character in characters
+    char_mentions: Dict[Character, Set[Mention]] = {
+        character: set(character.mentions) for character in characters
     }

     # we assign each chain to the character with highest name
@@ -80,12 +80,12 @@ def _assign_coreference_mentions(

         # assign the chain to the character with the most occurences
         for mention in chain:
-            # TODO: complexity
             if not mention in char_mentions[best_character]:
-                char_mentions[best_character].…
+                char_mentions[best_character].add(mention)

     return [
-        Character(c.names, mentions, …
+        Character(c.names, sorted(mentions, key=lambda m: m.start_idx), c.gender)
+        for c, mentions in char_mentions.items()
     ]

@@ -209,7 +209,6 @@ class GraphRulesCharacterUnifier(PipelineStep):

         # * link nodes based on several rules
         for name1, name2 in combinations(G.nodes(), 2):
-
             # is one name a known hypocorism of the other ? (also
             # checks if both names are the same)
             if self.hypocorism_gazetteer.are_related(name1, name2):
@@ -263,7 +262,6 @@ class GraphRulesCharacterUnifier(PipelineStep):
             pass

         for name1, name2 in combinations(G.nodes(), 2):
-
             # check if characters have the same last name but a
             # different first name.
             human_name1 = HumanName(name1, constants=hname_constants)
@@ -333,10 +331,11 @@ class GraphRulesCharacterUnifier(PipelineStep):
         self, name1: str, name2: str, hname_constants: Constants
     ) -> bool:
         """Check if two names are related after removing their titles"""
-…
-…
-        raw_name1 = HumanName(name1, constants=…
-        raw_name2 = HumanName(name2, constants=…
+        old_string_format = hname_constants.string_format
+        hname_constants.string_format = "{first} {middle} {last}"
+        raw_name1 = HumanName(name1, constants=hname_constants).full_name
+        raw_name2 = HumanName(name2, constants=hname_constants).full_name
+        hname_constants.string_format = old_string_format

         if raw_name1 == "" or raw_name2 == "":
             return False
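For reference, a minimal sketch of the title-stripping trick the last hunk relies on, using the `nameparser` package's `Constants` and `HumanName` exactly as the diffed code does (the example names are made up):

```python
from nameparser import HumanName
from nameparser.config import Constants

constants = Constants()
# render names without the {title} field, so titles are dropped
constants.string_format = "{first} {middle} {last}"

# both should render to the same raw name, "John Smith"
print(HumanName("Mr. John Smith", constants=constants).full_name)
print(HumanName("John Smith", constants=constants).full_name)
```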
renard/pipeline/characters_extraction.py
CHANGED

@@ -1,7 +1,7 @@
 import renard.pipeline.character_unification as cu

 print(
-    "[warning] the characters_extraction module is deprecated. Use …
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
 )

 Character = cu.Character
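Since the module now only re-exports names from `character_unification`, old imports keep working (with the warning above printed at import time):

```python
# deprecated path, still functional through the alias:
from renard.pipeline.characters_extraction import Character
# preferred path as of 0.4.x:
from renard.pipeline.character_unification import Character
```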
renard/pipeline/core.py
CHANGED
@@ -16,7 +16,7 @@ from typing import (
     Type,
     TYPE_CHECKING,
 )
-import os
+import os, sys

 import networkx as nx
 from networkx.readwrite.gexf import GEXFWriter
@@ -50,6 +50,13 @@ class Mention:
         self_dict["end_idx"] = self.end_idx + shift
         return self.__class__(**self_dict)

+    def __eq__(self, other: Mention) -> bool:
+        return (
+            self.tokens == other.tokens
+            and self.start_idx == other.start_idx
+            and self.end_idx == other.end_idx
+        )
+
     def __hash__(self) -> int:
         return hash(tuple(self.tokens) + (self.start_idx, self.end_idx))

@@ -171,8 +178,18 @@ class PipelineState:
     #: detected characters
     characters: Optional[List[Character]] = None

-    #: …
-    …
+    #: character network (or list of network in the case of a dynamic
+    #: network)
+    character_network: Optional[Union[List[nx.Graph], nx.Graph]] = None
+
+    def get_characters_graph(self) -> Optional[Union[List[nx.Graph], nx.Graph]]:
+        print(
+            "[warning] the characters_graph attribute is deprecated, use character_network instead",
+            file=sys.stderr,
+        )
+        return self.character_network
+
+    characters_graph = property(get_characters_graph)

     def get_character(
         self, name: str, partial_match: bool = True
@@ -228,8 +245,8 @@ class PipelineState:
         for more details
         """
         path = os.path.expanduser(path)
-        if isinstance(self.…
-            G = dynamic_graph_to_gephi_graph(self.…
+        if isinstance(self.character_network, list):
+            G = dynamic_graph_to_gephi_graph(self.character_network)
             G = graph_with_names(G, name_style)
             # HACK: networkx cannot set a dynamic "weight" attribute
             # in gexf since "weight" has a specific meaning in
@@ -251,7 +268,7 @@ class PipelineState:
                 attvalue.set("for", "weight")
             writer.write(path)
         else:
-            G = graph_with_names(self.…
+            G = graph_with_names(self.character_network, name_style)
             nx.write_gexf(G, path)

     def plot_graphs_to_dir(
@@ -280,23 +297,23 @@ class PipelineState:
         """
         import matplotlib.pyplot as plt

-        assert not self.…
-        if isinstance(self.…
+        assert not self.character_network is None
+        if isinstance(self.character_network, nx.Graph):
             raise ValueError("this function is supposed to be used on a dynamic graph")

         directory = directory.rstrip("/")
         directory = os.path.expanduser(directory)
         os.makedirs(directory, exist_ok=True)

-        graphs = self.…
+        graphs = self.character_network
         if cumulative:
-            graphs = cumulative_graph(self.…
+            graphs = cumulative_graph(self.character_network)

         if stable_layout:
             layout_graph = (
                 graphs[-1]
                 if cumulative
-                else cumulative_graph(self.…
+                else cumulative_graph(self.character_network)[-1]
             )
             layout = layout_nx_graph_reasonably(layout_graph)

@@ -330,13 +347,13 @@ class PipelineState:
         """
         import matplotlib.pyplot as plt

-        assert not self.…
-        if isinstance(self.…
+        assert not self.character_network is None
+        if isinstance(self.character_network, list):
             raise ValueError("this function is supposed to be used on a static graph")

         if not layout is None:
-            layout = layout_with_names(self.…
-        G = graph_with_names(self.…
+            layout = layout_with_names(self.character_network, layout, name_style)
+        G = graph_with_names(self.character_network, name_style=name_style)
         if fig is None:
             # default values for a sufficiently sized graph
             fig = plt.gcf()
@@ -359,7 +376,7 @@ class PipelineState:
         stable_layout: bool = False,
         layout: Optional[CharactersGraphLayout] = None,
     ):
-        """Plot ``self.…
+        """Plot ``self.character_network`` using reasonable default
         parameters

         .. note::
@@ -372,13 +389,13 @@ class PipelineState:
             details
         :param fig: if specified, this matplotlib figure will be used
             for plotting
-        :param cumulative: if ``True`` and ``self.…
+        :param cumulative: if ``True`` and ``self.character_network``
             is dynamic, plot a cumulative graph instead of a
             sequential one
-        :param graph_start_idx: When ``self.…
+        :param graph_start_idx: When ``self.character_network`` is
             dynamic, index of the first graph to plot, starting at 1
             (not 0, since the graph slider starts at 1)
-        :param stable_layout: if ``self.…
+        :param stable_layout: if ``self.character_network`` is dynamic
             and this parameter is ``True``, characters will keep the
             same position in space at each timestep. Characters'
             positions are based on the final cumulative graph layout.
@@ -387,13 +404,13 @@ class PipelineState:
         import matplotlib.pyplot as plt
         from matplotlib.widgets import Slider

-        assert not self.…
+        assert not self.character_network is None

-        # self.…
-        if isinstance(self.…
+        # self.character_network is a static graph
+        if isinstance(self.character_network, nx.Graph):
             if not layout is None:
-                layout = layout_with_names(self.…
-            G = graph_with_names(self.…
+                layout = layout_with_names(self.character_network, layout, name_style)
+            G = graph_with_names(self.character_network, name_style)
             if fig is None:
                 # default value for a sufficiently sized graph
                 fig = plt.gcf()
@@ -404,9 +421,9 @@ class PipelineState:
             plot_nx_graph_reasonably(G, ax=ax, layout=layout)
             return

-        if not isinstance(self.…
+        if not isinstance(self.character_network, list):
             raise TypeError
-        # self.…
+        # self.character_network is a list: plot a dynamic graph

         if fig is None:
             fig, ax = plt.subplots()
@@ -417,18 +434,18 @@ class PipelineState:
             ax = fig.add_subplot(111)
         assert not fig is None

-        …
+        cumulative_character_networks = cumulative_graph(self.character_network)
         if stable_layout:
-            layout = layout_nx_graph_reasonably(…
+            layout = layout_nx_graph_reasonably(cumulative_character_networks[-1])

         def update(slider_value):
-            assert isinstance(self.…
+            assert isinstance(self.character_network, list)

-            …
+            character_networks = self.character_network
             if cumulative:
-                …
+                character_networks = cumulative_character_networks

-            G = …
+            G = character_networks[int(slider_value) - 1]

             local_layout = layout
             if not local_layout is None:
@@ -447,8 +464,8 @@ class PipelineState:
                 ax=slider_ax,
                 label="Graph",
                 valmin=1,
-                valmax=len(self.…
-                valstep=[i + 1 for i in range(len(self.…
+                valmax=len(self.character_network),
+                valstep=[i + 1 for i in range(len(self.character_network))],
             )
             fig.slider.on_changed(update)  # type: ignore
             fig.slider.set_val(graph_start_idx)  # type: ignore
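In short, `PipelineState.characters_graph` is renamed to `character_network`, with a deprecation property kept for backward compatibility. A sketch of the resulting behaviour (`out` stands for any pipeline output, as in the README tutorial further down):

```python
# new canonical attribute: an nx.Graph, or a List[nx.Graph] for dynamic networks
G = out.character_network

# the old attribute still resolves to the same object, but prints a
# deprecation warning on stderr
assert out.characters_graph is G
```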
renard/pipeline/graph_extraction.py
CHANGED

@@ -158,7 +158,7 @@ class CoOccurrencesGraphExtractor(PipelineStep):

         :param characters:

-        :return: a ``dict`` with key ``'…
+        :return: a ``dict`` with key ``'character_network'`` and a
             :class:`nx.Graph` or a list of :class:`nx.Graph` as
             value.
         """
@@ -170,7 +170,7 @@ class CoOccurrencesGraphExtractor(PipelineStep):

         if self.dynamic:
             return {
-                "…
+                "character_network": self._extract_dynamic_graph(
                     mentions,
                     self.dynamic_window,
                     self.dynamic_overlap,
@@ -180,7 +180,7 @@ class CoOccurrencesGraphExtractor(PipelineStep):
             )
         }
         return {
-            "…
+            "character_network": self._extract_graph(
                 mentions, sentences, sentences_polarities
             )
         }
@@ -419,7 +419,7 @@ class CoOccurrencesGraphExtractor(PipelineStep):
         return needs

     def production(self) -> Set[str]:
-        return {"…
+        return {"character_network"}

     def optional_needs(self) -> Set[str]:
         return {"sentences_polarities"}
@@ -475,20 +475,17 @@ class ConversationalGraphExtractor(PipelineStep):
         characters: Set[Character],
         **kwargs,
     ) -> Dict[str, Any]:
-
         G = nx.Graph()
         for character in characters:
             G.add_node(character)

         for i, (quote_1, speaker_1) in enumerate(zip(quotes, speakers)):
-
             # no speaker prediction: ignore
             if speaker_1 is None:
                 continue

             # check ahead for co-occurences
             for quote_2, speaker_2 in zip(quotes[i + 1 :], speakers[i + 1 :]):
-
                 # no speaker prediction: ignore
                 if speaker_2 is None:
                     continue
@@ -507,12 +504,12 @@ class ConversationalGraphExtractor(PipelineStep):
                 G.add_edge(speaker_1, speaker_2, weight=0)
             G.edges[speaker_1, speaker_2]["weight"] += 1

-        return {"…
+        return {"character_network": G}

     def needs(self) -> Set[str]:
         """sentences, quotes, speakers, characters"""
         return {"sentences", "quotes", "speakers", "characters"}

     def production(self) -> Set[str]:
-        """…
-        return {"…
+        """character_network"""
+        return {"character_network"}
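Correspondingly, both graph extractors now advertise and fill the `character_network` output key instead of the old one. A quick check, using the constructor argument shown in the README tutorial below:

```python
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor

extractor = CoOccurrencesGraphExtractor(co_occurrences_dist=25)
assert extractor.production() == {"character_network"}
```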
renard/pipeline/ner.py
CHANGED
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import random, itertools
 from typing import TYPE_CHECKING, List, Dict, Any, Set, Tuple, Optional, Union, Literal
 from dataclasses import dataclass
 import torch
@@ -10,6 +11,7 @@ from renard.ner_utils import (
 )
 from renard.pipeline.core import PipelineStep, Mention
 from renard.pipeline.progress import ProgressReporter
+from renard.ner_utils import ner_entities

 if TYPE_CHECKING:
     from transformers.tokenization_utils_base import BatchEncoding
@@ -32,60 +34,8 @@ class NEREntity(Mention):
         """
         return super().shifted(shift)  # type: ignore

-
-def ner_entities(
-    tokens: List[str], bio_tags: List[str], resolve_inconsistencies: bool = True
-) -> List[NEREntity]:
-    """Extract NER entities from a list of BIO tags
-
-    :param tokens: a list of tokens
-    :param bio_tags: a list of BIO tags. In particular, BIO tags
-        should be in the CoNLL-2002 form (such as 'B-PER I-PER')
-
-    :return: A list of ner entities, in apparition order
-    """
-    assert len(tokens) == len(bio_tags)
-
-    entities = []
-    current_tag: Optional[str] = None
-    current_tag_start_idx: Optional[int] = None
-
-    for i, tag in enumerate(bio_tags):
-        if not current_tag is None and not tag.startswith("I-"):
-            assert not current_tag_start_idx is None
-            entities.append(
-                NEREntity(
-                    tokens[current_tag_start_idx:i],
-                    current_tag_start_idx,
-                    i,
-                    current_tag,
-                )
-            )
-            current_tag = None
-            current_tag_start_idx = None
-
-        if tag.startswith("B-"):
-            current_tag = tag[2:]
-            current_tag_start_idx = i
-
-        elif tag.startswith("I-"):
-            if current_tag is None and resolve_inconsistencies:
-                current_tag = tag[2:]
-                current_tag_start_idx = i
-                continue
-
-    if not current_tag is None:
-        assert not current_tag_start_idx is None
-        entities.append(
-            NEREntity(
-                tokens[current_tag_start_idx : len(tokens)],
-                current_tag_start_idx,
-                len(bio_tags),
-                current_tag,
-            )
-        )
-
-    return entities
+    def __hash__(self) -> int:
+        return hash(tuple(self.tokens) + (self.start_idx, self.end_idx, self.tag))


 def score_ner(
@@ -151,11 +101,69 @@ class NLTKNamedEntityRecognizer(PipelineStep):
         return {"entities"}


+class NERContextRetriever:
+    def __call__(self, dataset: NERDataset) -> NERDataset:
+        raise NotImplementedError
+
+
+class NERSamenounContextRetriever(NERContextRetriever):
+    """
+    Retrieve relevant context using the samenoun strategy as in
+    Amalvy et al. 2023.
+    """
+
+    def __init__(self, k: int) -> None:
+        """
+        :param k: the number of sentences to retrieve
+        """
+        self.k = k
+
+    def __call__(self, dataset: NERDataset) -> NERDataset:
+        import nltk
+
+        # NOTE: POS tagging is not incorporated in the pipeline yet,
+        # so we manually compute it here.
+        elements_names = [
+            {t[0] for t in nltk.pos_tag(element) if t[1].startswith("NN")}
+            for element in dataset.elements
+        ]
+
+        elements_with_context = []
+
+        for elt_i, elt in enumerate(dataset.elements):
+            retrieved_elts = [
+                other_elt
+                for other_elt_i, other_elt in enumerate(dataset.elements)
+                if not other_elt_i == elt_i
+                and len(elements_names[elt_i].intersection(elements_names[other_elt_i]))
+                > 0
+            ]
+            retrieved_elts = random.sample(
+                retrieved_elts, k=min(self.k, len(retrieved_elts))
+            )
+            elements_with_context.append(
+                (
+                    elt,
+                    [dataset.tokenizer.sep_token]
+                    + list(itertools.chain.from_iterable(retrieved_elts)),
+                )
+            )
+
+        return NERDataset(
+            [element + context for element, context in elements_with_context],
+            dataset.tokenizer,
+            [
+                [0] * len(element) + [1] * len(context)
+                for element, context in elements_with_context
+            ],
+        )
+
+
 class BertNamedEntityRecognizer(PipelineStep):
     """An entity recognizer based on BERT"""

     LANG_TO_MODELS = {
-        "fra": "…
+        "fra": "compnet-renard/camembert-base-literary-NER",
         "eng": "compnet-renard/bert-base-cased-literary-NER",
     }

@@ -165,6 +173,7 @@ class BertNamedEntityRecognizer(PipelineStep):
         batch_size: int = 4,
         device: Literal["cpu", "cuda", "auto"] = "auto",
         tokenizer: Optional[PreTrainedTokenizerFast] = None,
+        context_retriever: Optional[NERContextRetriever] = None,
     ):
         """
         :param model: Either:
@@ -181,6 +190,9 @@ class BertNamedEntityRecognizer(PipelineStep):
         :param batch_size: batch size at inference
         :param device: computation device
         :param tokenizer: a custom tokenizer
+        :param context_retriever: if specified, use
+            ``context_retriever`` to retrieve relevant global context
+            at run time, generally trading runtime for NER performance.
         """
         if isinstance(model, str):
             self.huggingface_model_id = model
@@ -198,6 +210,8 @@ class BertNamedEntityRecognizer(PipelineStep):
         else:
             self.device = torch.device(device)

+        self.context_retriever = context_retriever
+
         super().__init__()

     def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
@@ -208,7 +222,6 @@ class BertNamedEntityRecognizer(PipelineStep):
         # init model if needed (this happens if the user did not pass
         # the instance of a model)
         if self.model is None:
-
             # the user supplied a huggingface ID: load model from the HUB
             if not self.huggingface_model_id is None:
                 self.model = AutoModelForTokenClassification.from_pretrained(
@@ -251,6 +264,10 @@ class BertNamedEntityRecognizer(PipelineStep):
         self.model = self.model.to(self.device)

         dataset = NERDataset(sentences, self.tokenizer)
+
+        if not self.context_retriever is None:
+            dataset = self.context_retriever(dataset)
+
         dataloader = DataLoader(
             dataset,
             batch_size=self.batch_size,
@@ -262,7 +279,6 @@ class BertNamedEntityRecognizer(PipelineStep):
         labels = []

         with torch.no_grad():
-
             for batch_i, batch in enumerate(self._progress_(dataloader)):
                 out = self.model(
                     batch["input_ids"].to(self.device),
@@ -277,7 +293,9 @@ class BertNamedEntityRecognizer(PipelineStep):
                     for tens in batch_classes_tens[i]
                 ]
                 sent_tokens = sentences[self.batch_size * batch_i + i]
-                sent_labels = self.batch_labels(…
+                sent_labels = self.batch_labels(
+                    batch, i, wp_labels, sent_tokens, batch["context_mask"]
+                )
                 labels += sent_labels

         return {"entities": ner_entities(tokens, labels)}
@@ -288,6 +306,7 @@ class BertNamedEntityRecognizer(PipelineStep):
         batch_i: int,
         wp_labels: List[str],
         tokens: List[str],
+        context_mask: torch.Tensor,
     ) -> List[str]:
         """Align labels to tokens rather than wordpiece tokens.

@@ -299,6 +318,8 @@ class BertNamedEntityRecognizer(PipelineStep):
         batch_labels = ["O"] * len(tokens)

         for wplabel_j, wp_label in enumerate(wp_labels):
+            if context_mask[batch_i][wplabel_j] == 1:
+                continue
             token_i = batchs.token_to_word(batch_i, wplabel_j)
             if token_i is None:
                 continue
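A hedged usage sketch of the new retrieval-augmented NER step, combining the classes added above (the model ID is the English one listed in `LANG_TO_MODELS`):

```python
from renard.pipeline.ner import (
    BertNamedEntityRecognizer,
    NERSamenounContextRetriever,
)

ner_step = BertNamedEntityRecognizer(
    model="compnet-renard/bert-base-cased-literary-NER",
    batch_size=4,
    # retrieve up to 2 context sentences per input with the samenoun strategy;
    # retrieved tokens are excluded from predictions via the context mask
    context_retriever=NERSamenounContextRetriever(k=2),
)
```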
renard/py.typed
ADDED
File without changes
renard/utils.py
CHANGED
@@ -1,12 +1,7 @@
-from typing import List, Tuple, TypeVar, Collection, Iterable, …
-import re, os
-from more_itertools import flatten
+from typing import List, Tuple, TypeVar, Collection, Iterable, cast
 from more_itertools.more import windowed
 import torch

-from renard.pipeline.ner import NEREntity, ner_entities
-
-
 T = TypeVar("T")


@@ -81,49 +76,3 @@ def search_pattern(seq: Iterable[R], pattern: List[R]) -> List[int]:
         if list(subseq) == pattern:
             start_indices.append(subseq_i)
     return start_indices
-
-
-def load_conll2002_bio(
-    path: str,
-    tag_conversion_map: Optional[Dict[str, str]] = None,
-    separator: str = "\t",
-    **kwargs
-) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
-    """Load a file under CoNLL2022 BIO format. Sentences are expected
-    to be separated by end of lines. Tags should be in the CoNLL-2002
-    format (such as 'B-PER I-PER') - If this is not the case, see the
-    ``tag_conversion_map`` argument.
-
-    :param path: path to the CoNLL-2002 formatted file
-    :param separator: separator between token and BIO tags
-    :param tag_conversion_map: conversion map for tags found in the
-        input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
-    :param kwargs: additional kwargs for ``open`` (such as
-        ``encoding`` or ``newline``).
-
-    :return: ``(sentences, tokens, entities)``
-    """
-
-    if tag_conversion_map is None:
-        tag_conversion_map = {}
-
-    with open(os.path.expanduser(path), **kwargs) as f:
-        raw_data = f.read()
-
-    sents = []
-    sent_tokens = []
-    tags = []
-    for line in raw_data.split("\n"):
-        line = line.strip("\n")
-        if re.fullmatch(r"\s*", line):
-            sents.append(sent_tokens)
-            sent_tokens = []
-            continue
-        token, tag = line.split(separator)
-        sent_tokens.append(token)
-        tags.append(tag_conversion_map.get(tag, tag))
-
-    tokens = list(flatten(sents))
-    entities = ner_entities(tokens, tags)
-
-    return sents, list(flatten(sents)), entities
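Net effect for users: `load_conll2002_bio` (and the `NEREntity` helpers it depends on) moved out of `renard.utils`; as of 0.4.1 it should be imported from `renard.ner_utils`:

```python
# old (0.3.x): from renard.utils import load_conll2002_bio
from renard.ner_utils import load_conll2002_bio  # new location in 0.4.1
```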
{renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/METADATA
CHANGED

@@ -1,7 +1,8 @@
 Metadata-Version: 2.1
 Name: renard-pipeline
-Version: 0.3.1
+Version: 0.4.1
 Summary: Relationships Extraction from NARrative Documents
+Home-page: https://github.com/CompNet/Renard
 License: GPL-3.0-only
 Author: Arthur Amalvy
 Author-email: arthur.amalvy@univ-avignon.fr
@@ -14,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
 Provides-Extra: spacy
 Provides-Extra: stanza
 Requires-Dist: coreferee (>=1.4.0,<2.0.0) ; extra == "spacy"
+Requires-Dist: datasets (>=2.16.1,<3.0.0)
 Requires-Dist: grimbert (>=0.1.0,<0.2.0)
 Requires-Dist: matplotlib (>=3.5.3,<4.0.0)
 Requires-Dist: more-itertools (>=10.1.0,<11.0.0)
@@ -26,15 +28,19 @@ Requires-Dist: seqeval (==1.2.2)
 Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
 Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
 Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
-Requires-Dist: tibert (>=0.…
+Requires-Dist: tibert (>=0.3.0,<0.4.0)
 Requires-Dist: torch (>=2.0.0,!=2.0.1)
 Requires-Dist: tqdm (>=4.62.3,<5.0.0)
 Requires-Dist: transformers (>=4.36.0,<5.0.0)
+Project-URL: Documentation, https://compnet.github.io/Renard/
+Project-URL: Repository, https://github.com/CompNet/Renard
 Description-Content-Type: text/markdown

 # Renard

-Relationships Extraction from NARrative Documents
+Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character network extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+…


 # Installation
@@ -43,6 +49,8 @@ You can install the latest version using pip:

 > pip install renard-pipeline

+Currently, Renard supports Python 3.8, 3.9 and 3.10.
+

 # Documentation

@@ -53,7 +61,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d

 # Tutorial

-…
+Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+```python
+from renard.pipeline import Pipeline
+from renard.pipeline.tokenization import NLTKTokenizer
+from renard.pipeline.ner import NLTKNamedEntityRecognizer
+from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+with open("./my_doc.txt") as f:
+    text = f.read()
+
+pipeline = Pipeline(
+    [
+        NLTKTokenizer(),
+        NLTKNamedEntityRecognizer(),
+        GraphRulesCharacterUnifier(min_appearance=10),
+        CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+    ]
+)
+
+out = pipeline(text)
+```
+
+For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+


 # Running tests
@@ -64,3 +97,8 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d

 Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.

+
+# Contributing
+
+see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
+
{renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/RECORD
CHANGED

@@ -1,24 +1,25 @@
 renard/gender.py,sha256=HDtJQKOqIkV8F-Mxva95XFXWJoKRKckQ3fc93OBM6sw,102
 renard/graph_utils.py,sha256=5jwky9JgJ-WMVHfeaiXkAAQwEfhR2BFSrWhck1Qmpgo,5812
-renard/ner_utils.py,sha256=…
+renard/ner_utils.py,sha256=jN1AQkaV0Kx-Bc0oc3SYBEmSUuKPBbzXqByOlaqH62k,11263
 renard/nltk_utils.py,sha256=mUJiwMrEDZV4Fla7WuMR-hA_OC2ZIwSXgW_0Ew18VSo,977
 renard/pipeline/__init__.py,sha256=8Yim2mmny8YGvM7N5-na5zK-C9UDxUb77K9ml-VirUA,35
-renard/pipeline/character_unification.py,sha256=…
-renard/pipeline/characters_extraction.py,sha256=…
-renard/pipeline/core.py,sha256=…
+renard/pipeline/character_unification.py,sha256=GcnC8UYqn1RBOGVhYS9LVcTNqpxm9YoT-lPsE3vodek,14818
+renard/pipeline/characters_extraction.py,sha256=NzF8H9X19diW6rqwS5ERrRku7rFueO3S077H5C6kb7I,363
+renard/pipeline/core.py,sha256=luKNUTCDtZfwKzxVIaImyIMwFFvIknfT1LdQtongj24,22570
 renard/pipeline/corefs/__init__.py,sha256=9c9AaXBcRrDBf1jhTtJ7DyjOJhX_Zej3FjlcGak7MK8,44
 renard/pipeline/corefs/corefs.py,sha256=nzYT6S9ify3FlgGB3FSDpAhs2UQYgW9c3CL2GRYzTms,11508
-renard/pipeline/graph_extraction.py,sha256=…
-renard/pipeline/ner.py,sha256=…
+renard/pipeline/graph_extraction.py,sha256=n0T_nzNGiwE9bDubpPknHe7bbDhJ4ndnqmoMmyfbeWg,19468
+renard/pipeline/ner.py,sha256=5zqZlEjhO__0iuRQAN9rvhCbcd9QmNCcH9_NP_BaTbc,11261
 renard/pipeline/preconfigured.py,sha256=j4-0OUZrmtC8rQfwGWEAAGNxc8-4hlY7N823Uami5lk,5392
 renard/pipeline/preprocessing.py,sha256=OsdsYzmRweAiQV_CtP7uiz--OGogZtQlsdR8XX5DCk0,952
 renard/pipeline/progress.py,sha256=VQsIxTuz0QQnepXPevHhMU-dHXMa1RWsjmMfBgoWdiY,1684
 renard/pipeline/quote_detection.py,sha256=FyldJhynIT843fB7rwVtHmDZJqTKkjGml6qTLjsIhMA,2045
 renard/pipeline/sentiment_analysis.py,sha256=76MPin4L1-vSswJe5yGrbCSSDim1LYxSEgNj_BdQDvk,1464
 renard/pipeline/speaker_attribution.py,sha256=qCY-Z1haDDgZy8L4k8pAc6xIcSFmtcuuESu631QxRUY,4366
-renard/pipeline/stanford_corenlp.py,sha256=…
+renard/pipeline/stanford_corenlp.py,sha256=14b6Ee6oPz1EL-bNRT688aNxVTk_Jwa_vJ20FiBODC4,8189
 renard/pipeline/tokenization.py,sha256=RllOxSjaV_Sdu3CH8vKIbceNj3Noeey31mKircxWoyM,1806
 renard/plot_utils.py,sha256=bmIBybleFJ-YiVPLPPWYW8x1UHpkuXTE7O9lQlRiWrk,2133
+renard/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 renard/resources/hypocorisms/__init__.py,sha256=vlsY9PqxQCIpijxm79Y0KYh2c0S4S1pgrC9w-AUQGvE,55
 renard/resources/hypocorisms/datas/License.txt,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
 renard/resources/hypocorisms/datas/hypocorisms.csv,sha256=CKTo7A5i14NzN6JRBz7U2NJnxrEo8VOlmmdhzEZnqlI,21470
@@ -27,8 +28,8 @@ renard/resources/pronouns/__init__.py,sha256=62h0zuXp8kCToTLTyg8D8rJ-MXQpT8Vyc6m
 renard/resources/pronouns/pronouns.py,sha256=YJ8hM6H8QHrF2Xx6O5blqc-Sqe1D1YFL0sRdqO_rroE,817
 renard/resources/titles/__init__.py,sha256=Jcg4B7stsWiAaXbFgNl_L3ICtCQmFe9bo3YjdkVL50w,45
 renard/resources/titles/titles.py,sha256=GsFccVJuTkgDWiAqWZpFd2R9pGvFKQZBOk4RWWuWDkw,968
-renard/utils.py,sha256=…
-renard_pipeline-0.…
-renard_pipeline-0.…
-renard_pipeline-0.…
-renard_pipeline-0.…
+renard/utils.py,sha256=8J3swFqSi4YqhgYNXvttJ0s-DmJbl_yEYri6JpGEWH8,2340
+renard_pipeline-0.4.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+renard_pipeline-0.4.1.dist-info/METADATA,sha256=KgpnPAR6BtLS4RNjsxIBWqUygUcoRdJfkqHigzZMSqU,3697
+renard_pipeline-0.4.1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+renard_pipeline-0.4.1.dist-info/RECORD,,
{renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/LICENSE
File without changes

{renard_pipeline-0.3.1.dist-info → renard_pipeline-0.4.1.dist-info}/WHEEL
File without changes