PyPI - renard-pipeline - Versions diffs - 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

renard-pipeline 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of renard-pipeline might be problematic. Click here for more details.

Files changed (23) hide show

renard/graph_utils.py +11 -4
renard/ner_utils.py +24 -14
renard/pipeline/character_unification.py +62 -19
renard/pipeline/characters_extraction.py +3 -1
renard/pipeline/core.py +141 -26
renard/pipeline/corefs/corefs.py +32 -33
renard/pipeline/graph_extraction.py +281 -192
renard/pipeline/ner/__init__.py +1 -0
renard/pipeline/{ner.py → ner/ner.py} +47 -76
renard/pipeline/ner/retrieval.py +375 -0
renard/pipeline/progress.py +32 -1
renard/pipeline/speaker_attribution.py +2 -3
renard/pipeline/tokenization.py +59 -30
renard/plot_utils.py +48 -28
renard/resources/determiners/__init__.py +1 -0
renard/resources/determiners/determiners.py +41 -0
renard/resources/hypocorisms/hypocorisms.py +3 -2
renard/utils.py +57 -1
{renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/METADATA +45 -20
renard_pipeline-0.6.0.dist-info/RECORD +39 -0
renard_pipeline-0.4.2.dist-info/RECORD +0 -35
{renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/LICENSE +0 -0
{renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/WHEEL +0 -0

renard/graph_utils.py CHANGED Viewed

@@ -70,10 +70,17 @@ def graph_with_names(
     else:
         name_style_fn = name_style
-    return nx.relabel_nodes(
-        G,
-        {character: name_style_fn(character) for character in G.nodes()},  # type: ignore
-    )
+    mapping = {}
+    for character in G.nodes():
+        # NOTE: it is *possible* to have a graph where nodes are not
+        # characters (for example, simple strings). Therefore, we are
+        # lenient here
+        try:
+            mapping[character] = name_style_fn(character)
+        except AttributeError:
+            mapping[character] = character
+    return nx.relabel_nodes(G, mapping)
 def layout_with_names(

renard/ner_utils.py CHANGED Viewed

@@ -74,7 +74,7 @@ class DataCollatorForTokenClassificationWithBatchEncoding:
 class NERDataset(Dataset):
     """
     :ivar _context_mask: for each element, a mask indicating which
-        tokens are part of the context (1 for context, 0 for text on
+        tokens are part of the context (0 for context, 1 for text on
         which to perform inference).  The mask allows to discard
         predictions made for context at inference time, even though
         the context can still be passed as input to the model.
@@ -92,11 +92,11 @@ class NERDataset(Dataset):
             assert all(
                 [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
             )
-        self._context_mask = context_mask or [[0] * len(elt) for elt in self.elements]
+        self._context_mask = context_mask or [[1] * len(elt) for elt in self.elements]
         self.tokenizer = tokenizer
-    def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
+    def __getitem__(self, index: int) -> BatchEncoding:
         element = self.elements[index]
         batch = self.tokenizer(
@@ -104,15 +104,18 @@ class NERDataset(Dataset):
             truncation=True,
             max_length=512,  # TODO
             is_split_into_words=True,
+            return_length=True,
         )
-        batch["context_mask"] = [0] * len(batch["input_ids"])
-        elt_context_mask = self._context_mask[index]
-        for i in range(len(element)):
-            w2t = batch.word_to_tokens(0, i)
-            mask_value = elt_context_mask[i]
-            tokens_mask = [mask_value] * (w2t.end - w2t.start)
-            batch["context_mask"][w2t.start : w2t.end] = tokens_mask
+        length = batch["length"][0]
+        del batch["length"]
+        if self.tokenizer.truncation_side == "right":
+            batch["context_mask"] = self._context_mask[index][:length]
+        else:
+            assert self.tokenizer.truncation_side == "left"
+            batch["context_mask"] = self._context_mask[index][
+                len(batch["input_ids"]) - length :
+            ]
         return batch
@@ -181,6 +184,7 @@ def load_conll2002_bio(
     path: str,
     tag_conversion_map: Optional[Dict[str, str]] = None,
     separator: str = "\t",
+    max_sent_len: Optional[int] = None,
     **kwargs,
 ) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
     """Load a file under CoNLL2022 BIO format.  Sentences are expected
@@ -192,7 +196,9 @@ def load_conll2002_bio(
     :param separator: separator between token and BIO tags
     :param tag_conversion_map: conversion map for tags found in the
         input file.  Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
-    :param kwargs: additional kwargs for ``open`` (such as
+    :param max_sent_len: if specified, maximum length, in tokens, of
+        sentences.
+    :param kwargs: additional kwargs for :func:`open` (such as
         ``encoding`` or ``newline``).
     :return: ``(sentences, tokens, entities)``
@@ -207,7 +213,9 @@ def load_conll2002_bio(
     tags = []
     for line in raw_data.split("\n"):
         line = line.strip("\n")
-        if re.fullmatch(r"\s*", line):
+        if re.fullmatch(r"\s*", line) or (
+            not max_sent_len is None and len(sent_tokens) >= max_sent_len
+        ):
             if len(sent_tokens) == 0:
                 continue
             sents.append(sent_tokens)
@@ -227,6 +235,7 @@ def hgdataset_from_conll2002(
     path: str,
     tag_conversion_map: Optional[Dict[str, str]] = None,
     separator: str = "\t",
+    max_sent_len: Optional[int] = None,
     **kwargs,
 ) -> HGDataset:
     """Load a CoNLL-2002 file as a Huggingface Dataset.
@@ -234,12 +243,13 @@ def hgdataset_from_conll2002(
     :param path: passed to :func:`.load_conll2002_bio`
     :param tag_conversion_map: passed to :func:`load_conll2002_bio`
     :param separator: passed to :func:`load_conll2002_bio`
-    :param kwargs: passed to :func:`load_conll2002_bio`
+    :param max_sent_len: passed to :func:`load_conll2002_bio`
+    :param kwargs: additional kwargs for :func:`open`
     :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
     """
     sentences, tokens, entities = load_conll2002_bio(
-        path, tag_conversion_map, separator, **kwargs
+        path, tag_conversion_map, separator, max_sent_len, **kwargs
     )
     # convert entities to labels

renard/pipeline/character_unification.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from typing import Any, Dict, List, FrozenSet, Set, Optional, Tuple, Union, Literal
-import copy
+import re, sys
 from itertools import combinations
 from collections import defaultdict, Counter
 from dataclasses import dataclass
@@ -11,6 +11,7 @@ from renard.pipeline.ner import NEREntity
 from renard.pipeline.progress import ProgressReporter
 from renard.resources.hypocorisms import HypocorismGazetteer
 from renard.resources.pronouns import is_a_female_pronoun, is_a_male_pronoun
+from renard.resources.determiners import singular_determiners
 from renard.resources.titles import is_a_male_title, is_a_female_title, all_titles
@@ -61,6 +62,8 @@ def _assign_coreference_mentions(
     # we assign each chain to the character with highest name
     # occurence in it
     for chain in corefs:
+        if len(char_mentions) == 0:
+            break
         # determine the characters with the highest number of
         # occurences
         occ_counter = {}
@@ -98,8 +101,13 @@ class NaiveCharacterUnifier(PipelineStep):
             character for it to be valid
         """
         self.min_appearances = min_appearances
+        # a default value, will be est by _pipeline_init_
+        self.character_ner_tag = "PER"
         super().__init__()
+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
+        self.character_ner_tag = character_ner_tag
     def __call__(
         self,
         text: str,
@@ -112,7 +120,7 @@ class NaiveCharacterUnifier(PipelineStep):
         :param tokens:
         :param entities:
         """
-        persons = [e for e in entities if e.tag == "PER"]
+        persons = [e for e in entities if e.tag == self.character_ner_tag]
         characters = defaultdict(list)
         for entity in persons:
@@ -160,6 +168,7 @@ class GraphRulesCharacterUnifier(PipelineStep):
         additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
         link_corefs_mentions: bool = False,
         ignore_lone_titles: Optional[Set[str]] = None,
+        ignore_leading_determiner: bool = False,
     ) -> None:
         """
         :param min_appearances: minimum number of appearances of a
@@ -174,24 +183,32 @@ class GraphRulesCharacterUnifier(PipelineStep):
             extract a lot of spurious links.  However, linking by
             coref is sometimes the only way to resolve a character
             alias.
-        :param ignore_lone_titles: a set of titles to ignore when
-            they stand on their own.  This avoids extracting false
+        :param ignore_lone_titles: a set of titles to ignore when they
+            stand on their own.  This avoids extracting false
             positives characters such as 'Mr.' or 'Miss'.
+        :param ignore_leading_determiner: if ``True``, will ignore the
+            leading determiner when applying unification rules.  This
+            is useful if the NER model used in the pipeline adds
+            leading determiners as part of entites.
         """
         self.min_appearances = min_appearances
         self.additional_hypocorisms = additional_hypocorisms
         self.link_corefs_mentions = link_corefs_mentions
         self.ignore_lone_titles = ignore_lone_titles or set()
+        self.character_ner_tag = "PER"  # a default value, will be set by _pipeline_init
+        self.ignore_leading_determiner = ignore_leading_determiner
         super().__init__()
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
         self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
         if not self.additional_hypocorisms is None:
             for name, nicknames in self.additional_hypocorisms:
                 self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)
-        return super()._pipeline_init_(lang, progress_reporter)
+        self.character_ner_tag = character_ner_tag
+        return super()._pipeline_init_(lang, **kwargs)
     def __call__(
         self,
@@ -201,7 +218,7 @@ class GraphRulesCharacterUnifier(PipelineStep):
     ) -> Dict[str, Any]:
         import networkx as nx
-        mentions = [m for m in entities if m.tag == "PER"]
+        mentions = [m for m in entities if m.tag == self.character_ner_tag]
         mentions_str = set(
             filter(
                 lambda m: not m in self.ignore_lone_titles,
@@ -219,23 +236,28 @@ class GraphRulesCharacterUnifier(PipelineStep):
         # * link nodes based on several rules
         for name1, name2 in combinations(G.nodes(), 2):
+            # preprocess name when needed
+            pname1 = self._preprocess_name(name1)
+            pname2 = self._preprocess_name(name2)
             # is one name a known hypocorism of the other ? (also
             # checks if both names are the same)
-            if self.hypocorism_gazetteer.are_related(name1, name2):
+            if self.hypocorism_gazetteer.are_related(pname1, pname2):
                 G.add_edge(name1, name2)
                 continue
             # if we remove the title, is one name related to the other
             # ?
             if self.names_are_related_after_title_removal(
-                name1, name2, hname_constants
+                pname1, pname2, hname_constants
             ):
                 G.add_edge(name1, name2)
                 continue
             # add an edge if two characters have the same family names
-            human_name1 = HumanName(name1, constants=hname_constants)
-            human_name2 = HumanName(name2, constants=hname_constants)
+            human_name1 = HumanName(pname1, constants=hname_constants)
+            human_name2 = HumanName(pname2, constants=hname_constants)
             if (
                 len(human_name1.last) > 0
                 and human_name1.last.lower() == human_name2.last.lower()
@@ -272,10 +294,15 @@ class GraphRulesCharacterUnifier(PipelineStep):
                 pass
         for name1, name2 in combinations(G.nodes(), 2):
+            # preprocess names when needed
+            pname1 = self._preprocess_name(name1)
+            pname2 = self._preprocess_name(name2)
             # check if characters have the same last name but a
             # different first name.
-            human_name1 = HumanName(name1, constants=hname_constants)
-            human_name2 = HumanName(name2, constants=hname_constants)
+            human_name1 = HumanName(pname1, constants=hname_constants)
+            human_name2 = HumanName(pname2, constants=hname_constants)
             if (
                 len(human_name1.last) > 0
                 and len(human_name2.last) > 0
@@ -327,6 +354,17 @@ class GraphRulesCharacterUnifier(PipelineStep):
         return {"characters": characters}
+    def _preprocess_name(self, name) -> str:
+        if self.ignore_leading_determiner:
+            if not self.lang in singular_determiners:
+                print(
+                    f"[warning] can't ignore leading determiners for {self.lang}",
+                    file=sys.stderr,
+                )
+            for determiner in singular_determiners.get(self.lang, []):
+                name = re.sub(f"^{determiner} ", " ", name, flags=re.I)
+        return name
     def _make_hname_constants(self) -> Constants:
         if self.lang == "eng":
             return Constants()
@@ -355,13 +393,18 @@ class GraphRulesCharacterUnifier(PipelineStep):
             or self.hypocorism_gazetteer.are_related(raw_name1, raw_name2)
         )
-    def names_are_in_coref(self, name1: str, name2: str, corefs: List[List[Mention]]):
+    def names_are_in_coref(
+        self, name1: str, name2: str, corefs: List[List[Mention]]
+    ) -> bool:
+        once_together = False
         for coref_chain in corefs:
-            if any([name1 == " ".join(m.tokens) for m in coref_chain]) and any(
-                [name2 == " ".join(m.tokens) for m in coref_chain]
-            ):
-                return True
-        return False
+            name1_in = any([name1 == " ".join(m.tokens) for m in coref_chain])
+            name2_in = any([name2 == " ".join(m.tokens) for m in coref_chain])
+            if name1_in == (not name2_in):
+                return False
+            elif name1_in and name2_in:
+                once_together = True
+        return once_together
     def infer_name_gender(
         self,

renard/pipeline/characters_extraction.py CHANGED Viewed

@@ -1,7 +1,9 @@
+import sys
 import renard.pipeline.character_unification as cu
 print(
-    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead.",
+    file=sys.stderr,
 )
 Character = cu.Character

renard/pipeline/core.py CHANGED Viewed

@@ -79,11 +79,18 @@ class PipelineStep:
         """Initialize the :class:`PipelineStep` with a given configuration."""
         pass
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
-        """Set the step configuration that is common to the whole pipeline.
-        :param lang: ISO 639-3 language string
-        :param progress_report:
+    def _pipeline_init_(
+        self, lang: str, progress_reporter: ProgressReporter, **kwargs
+    ) -> Optional[Dict[Pipeline.PipelineParameter, Any]]:
+        """Set the step configuration that is common to the whole
+        pipeline.
+        :param lang: the lang of the whole pipeline
+        :param progress_reporter:
+        :param kwargs: additional pipeline parameters.
+        :return: a step can return a dictionary of pipeline params if
+                 it wish to modify some of these.
         """
         supported_langs = self.supported_langs()
         if not supported_langs == "any" and not lang in supported_langs:
@@ -150,13 +157,14 @@ class PipelineState:
     #: input text
     text: Optional[str]
-    #: text split into chapters
-    chapters: Optional[List[str]] = None
+    #: text split into blocks of texts. When dynamic blocks are given,
+    #: the final network is dynamic, and split according to blocks.
+    dynamic_blocks: Optional[List[Tuple[int, int]]] = None
     #: text splitted in tokens
     tokens: Optional[List[str]] = None
-    #: text splitted in tokens, by chapter
-    chapter_tokens: Optional[List[List[str]]] = None
+    #: mapping from a character to its corresponding token
+    char2token: Optional[List[int]] = None
     #: text splitted into sentences, each sentence being a list of
     #: tokens
     sentences: Optional[List[List[str]]] = None
@@ -182,14 +190,12 @@ class PipelineState:
     #: network)
     character_network: Optional[Union[List[nx.Graph], nx.Graph]] = None
+    # aliases of self.character_network
     def get_characters_graph(self) -> Optional[Union[List[nx.Graph], nx.Graph]]:
-        print(
-            "[warning] the characters_graph attribute is deprecated, use character_network instead",
-            file=sys.stderr,
-        )
         return self.character_network
     characters_graph = property(get_characters_graph)
+    character_graph = property(get_characters_graph)
     def get_character(
         self, name: str, partial_match: bool = True
@@ -280,6 +286,10 @@ class PipelineState:
         cumulative: bool = False,
         stable_layout: bool = False,
         layout: Optional[CharactersGraphLayout] = None,
+        node_kwargs: Optional[List[Dict[str, Any]]] = None,
+        edge_kwargs: Optional[List[Dict[str, Any]]] = None,
+        label_kwargs: Optional[List[Dict[str, Any]]] = None,
+        legend: bool = False,
     ):
         """Plot ``self.character_graph`` using reasonable default
         parameters, and save the produced figures in the specified
@@ -294,6 +304,10 @@ class PipelineState:
             timestep.  Characters' positions are based on the final
             cumulative graph layout.
         :param layout: pre-computed graph layout
+        :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param edge_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
+        :param legend: passed to :func:`.plot_nx_graph_reasonably`
         """
         import matplotlib.pyplot as plt
@@ -317,13 +331,25 @@ class PipelineState:
             )
             layout = layout_nx_graph_reasonably(layout_graph)
+        node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
+        edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
+        label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
         for i, G in enumerate(graphs):
             _, ax = plt.subplots()
             local_layout = layout
             if not local_layout is None:
                 local_layout = layout_with_names(G, local_layout, name_style)
             G = graph_with_names(G, name_style=name_style)
-            plot_nx_graph_reasonably(G, ax=ax, layout=local_layout)
+            plot_nx_graph_reasonably(
+                G,
+                ax=ax,
+                layout=local_layout,
+                node_kwargs=node_kwargs[i],
+                edge_kwargs=edge_kwargs[i],
+                label_kwargs=label_kwargs[i],
+                legend=legend,
+            )
             plt.savefig(f"{directory}/{i}.png")
             plt.close()
@@ -335,6 +361,11 @@ class PipelineState:
         ] = "most_frequent",
         layout: Optional[CharactersGraphLayout] = None,
         fig: Optional[plt.Figure] = None,
+        node_kwargs: Optional[Dict[str, Any]] = None,
+        edge_kwargs: Optional[Dict[str, Any]] = None,
+        label_kwargs: Optional[Dict[str, Any]] = None,
+        tight_layout: bool = False,
+        legend: bool = False,
     ):
         """Plot ``self.character_graph`` using reasonable parameters,
         and save the produced figure to a file
@@ -344,6 +375,11 @@ class PipelineState:
         :param layout: pre-computed graph layout
         :param fig: if specified, this matplotlib figure will be used
             for plotting
+        :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param edge_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
+        :param tight_layout: if ``True``, will use matplotlib's tight_layout
+        :param legend: passed to :func:`.plot_nx_graph_reasonably`
         """
         import matplotlib.pyplot as plt
@@ -361,7 +397,17 @@ class PipelineState:
             fig.set_dpi(300)
             fig.set_size_inches(24, 24)
         ax = fig.add_subplot(111)
-        plot_nx_graph_reasonably(G, ax=ax, layout=layout)
+        plot_nx_graph_reasonably(
+            G,
+            ax=ax,
+            layout=layout,
+            node_kwargs=node_kwargs,
+            edge_kwargs=edge_kwargs,
+            label_kwargs=label_kwargs,
+            legend=legend,
+        )
+        if tight_layout:
+            fig.tight_layout()
         plt.savefig(path)
         plt.close()
@@ -375,6 +421,11 @@ class PipelineState:
         graph_start_idx: int = 1,
         stable_layout: bool = False,
         layout: Optional[CharactersGraphLayout] = None,
+        node_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        edge_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        label_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        tight_layout: bool = False,
+        legend: bool = False,
     ):
         """Plot ``self.character_network`` using reasonable default
         parameters
@@ -400,6 +451,11 @@ class PipelineState:
             same position in space at each timestep.  Characters'
             positions are based on the final cumulative graph layout.
         :param layout: pre-computed graph layout
+        :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param edge_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
+        :param tight_layout: if ``True``, will use matplotlib's tight_layout
+        :param legend: passed to :func:`.plot_nx_graph_reasonably`
         """
         import matplotlib.pyplot as plt
         from matplotlib.widgets import Slider
@@ -418,13 +474,33 @@ class PipelineState:
                 fig.set_dpi(300)
                 fig.set_size_inches(24, 24)
             ax = fig.add_subplot(111)
-            plot_nx_graph_reasonably(G, ax=ax, layout=layout)
+            assert not isinstance(node_kwargs, list)
+            assert not isinstance(edge_kwargs, list)
+            assert not isinstance(label_kwargs, list)
+            if tight_layout:
+                fig.tight_layout()
+            plot_nx_graph_reasonably(
+                G,
+                ax=ax,
+                layout=layout,
+                node_kwargs=node_kwargs,
+                edge_kwargs=edge_kwargs,
+                label_kwargs=label_kwargs,
+                legend=legend,
+            )
             return
         if not isinstance(self.character_network, list):
             raise TypeError
         # self.character_network is a list: plot a dynamic graph
+        node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
+        assert isinstance(node_kwargs, list)
+        edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
+        assert isinstance(edge_kwargs, list)
+        label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
+        assert isinstance(label_kwargs, list)
         if fig is None:
             fig, ax = plt.subplots()
             assert not fig is None
@@ -440,12 +516,13 @@ class PipelineState:
         def update(slider_value):
             assert isinstance(self.character_network, list)
+            slider_i = int(slider_value) - 1
             character_networks = self.character_network
             if cumulative:
                 character_networks = cumulative_character_networks
-            G = character_networks[int(slider_value) - 1]
+            G = character_networks[slider_i]
             local_layout = layout
             if not local_layout is None:
@@ -453,11 +530,21 @@ class PipelineState:
             G = graph_with_names(G, name_style)
             ax.clear()
-            plot_nx_graph_reasonably(G, ax=ax, layout=local_layout)
+            plot_nx_graph_reasonably(
+                G,
+                ax=ax,
+                layout=local_layout,
+                node_kwargs=node_kwargs[slider_i],
+                edge_kwargs=edge_kwargs[slider_i],
+                label_kwargs=label_kwargs[slider_i],
+                legend=legend,
+            )
             ax.set_xlim(-1.2, 1.2)
             ax.set_ylim(-1.2, 1.2)
         slider_ax = fig.add_axes([0.1, 0.05, 0.8, 0.04])
+        if tight_layout:
+            fig.tight_layout()
         # HACK: we save the slider to the figure. This ensure the
         # slider is still alive at plotting time.
         fig.slider = Slider(  # type: ignore
@@ -474,6 +561,10 @@ class PipelineState:
 class Pipeline:
     """A flexible NLP pipeline"""
+    #: all the possible parameters of the whole pipeline, that are
+    #: shared between steps
+    PipelineParameter = Literal["lang", "progress_reporter", "character_ner_tag"]
     def __init__(
         self,
         steps: List[PipelineStep],
@@ -496,17 +587,27 @@ class Pipeline:
         self.progress_reporter = get_progress_reporter(progress_report)
         self.lang = lang
+        self.character_ner_tag = "PER"
         self.warn = warn
-    def _pipeline_init_steps(self, ignored_steps: Optional[List[str]] = None):
-        """
+    def _pipeline_init_steps_(self, ignored_steps: Optional[List[str]] = None):
+        """Initialise steps with global pipeline parameters.
         :param ignored_steps: a list of steps production.  All steps
             with a production in ``ignored_steps`` will be ignored.
         """
-        steps_progress_reporter = get_progress_reporter(self.progress_report)
+        steps_progress_reporter = self.progress_reporter.get_subreporter()
         steps = self._non_ignored_steps(ignored_steps)
+        pipeline_params = {
+            "progress_reporter": steps_progress_reporter,
+            "character_ner_tag": self.character_ner_tag,
+        }
         for step in steps:
-            step._pipeline_init_(self.lang, steps_progress_reporter)
+            step_additional_params = step._pipeline_init_(self.lang, **pipeline_params)
+            if not step_additional_params is None:
+                for key, value in step_additional_params.items():
+                    setattr(self, key, value)
+                    pipeline_params[key] = value
     def _non_ignored_steps(
         self, ignored_steps: Optional[List[str]]
@@ -549,13 +650,27 @@ class Pipeline:
                 return (
                     False,
                     [
-                        f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs (needs : {step.needs()}, available : {pipeline_state})"
+                        "".join(
+                            [
+                                f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs. "
+                                + f"needs: {step.needs()}. "
+                                + f"available: {pipeline_state}). "
+                                + f"missing: {step.needs() - pipeline_state}."
+                            ]
+                        ),
                     ],
                 )
             if not step.optional_needs().issubset(pipeline_state):
                 warnings.append(
-                    f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs : (optional needs : {step.optional_needs()}, available : {pipeline_state})"
+                    "".join(
+                        [
+                            f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs. "
+                            + f"needs: {step.optional_needs()}. "
+                            + f"available: {pipeline_state}). "
+                            + f"missing: {step.optional_needs() - pipeline_state}."
+                        ]
+                    )
                 )
             pipeline_state = pipeline_state.union(step.production())
@@ -582,9 +697,9 @@ class Pipeline:
             raise ValueError(warnings_or_errors)
         if self.warn:
             for warning in warnings_or_errors:
-                print(f"[warning] : {warning}")
+                print(f"[warning] : {warning}", file=sys.stderr)
-        self._pipeline_init_steps(ignored_steps)
+        self._pipeline_init_steps_(ignored_steps)
         state = PipelineState(text)
         # sets attributes to PipelineState dynamically. This ensures

renard-pipeline 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

Potentially problematic release.

renard-pipeline 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl