PyPI - renard-pipeline - Versions diffs - 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

renard-pipeline 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of renard-pipeline might be problematic. Click here for more details.

Files changed (18) hide show

renard/graph_utils.py +11 -4
renard/ner_utils.py +4 -0
renard/pipeline/character_unification.py +26 -6
renard/pipeline/characters_extraction.py +3 -1
renard/pipeline/core.py +121 -26
renard/pipeline/corefs/corefs.py +30 -31
renard/pipeline/graph_extraction.py +281 -192
renard/pipeline/ner.py +3 -2
renard/pipeline/progress.py +32 -1
renard/pipeline/speaker_attribution.py +2 -3
renard/pipeline/tokenization.py +59 -30
renard/plot_utils.py +41 -28
renard/resources/hypocorisms/hypocorisms.py +3 -2
renard/utils.py +57 -1
{renard_pipeline-0.4.1.dist-info → renard_pipeline-0.5.0.dist-info}/METADATA +27 -3
{renard_pipeline-0.4.1.dist-info → renard_pipeline-0.5.0.dist-info}/RECORD +18 -18
{renard_pipeline-0.4.1.dist-info → renard_pipeline-0.5.0.dist-info}/WHEEL +1 -1
{renard_pipeline-0.4.1.dist-info → renard_pipeline-0.5.0.dist-info}/LICENSE +0 -0

renard/graph_utils.py CHANGED Viewed

@@ -70,10 +70,17 @@ def graph_with_names(
     else:
         name_style_fn = name_style
-    return nx.relabel_nodes(
-        G,
-        {character: name_style_fn(character) for character in G.nodes()},  # type: ignore
-    )
+    mapping = {}
+    for character in G.nodes():
+        # NOTE: it is *possible* to have a graph where nodes are not
+        # characters (for example, simple strings). Therefore, we are
+        # lenient here
+        try:
+            mapping[character] = name_style_fn(character)
+        except AttributeError:
+            mapping[character] = character
+    return nx.relabel_nodes(G, mapping)
 def layout_with_names(

renard/ner_utils.py CHANGED Viewed

@@ -110,6 +110,10 @@ class NERDataset(Dataset):
         elt_context_mask = self._context_mask[index]
         for i in range(len(element)):
             w2t = batch.word_to_tokens(0, i)
+            # w2t can be None in case of truncation, which can happen
+            # if `element' is too long
+            if w2t is None:
+                continue
             mask_value = elt_context_mask[i]
             tokens_mask = [mask_value] * (w2t.end - w2t.start)
             batch["context_mask"][w2t.start : w2t.end] = tokens_mask

renard/pipeline/character_unification.py CHANGED Viewed

@@ -61,6 +61,8 @@ def _assign_coreference_mentions(
     # we assign each chain to the character with highest name
     # occurence in it
     for chain in corefs:
+        if len(char_mentions) == 0:
+            break
         # determine the characters with the highest number of
         # occurences
         occ_counter = {}
@@ -98,8 +100,13 @@ class NaiveCharacterUnifier(PipelineStep):
             character for it to be valid
         """
         self.min_appearances = min_appearances
+        # a default value, will be set by _pipeline_init_
+        self.character_ner_tag = "PER"
         super().__init__()
+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
+        self.character_ner_tag = character_ner_tag
     def __call__(
         self,
         text: str,
@@ -112,7 +119,7 @@ class NaiveCharacterUnifier(PipelineStep):
         :param tokens:
         :param entities:
         """
-        persons = [e for e in entities if e.tag == "PER"]
+        persons = [e for e in entities if e.tag == self.character_ner_tag]
         characters = defaultdict(list)
         for entity in persons:
@@ -159,6 +166,7 @@ class GraphRulesCharacterUnifier(PipelineStep):
         min_appearances: int = 0,
         additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
         link_corefs_mentions: bool = False,
+        ignore_lone_titles: Optional[Set[str]] = None,
     ) -> None:
         """
         :param min_appearances: minimum number of appearances of a
@@ -173,20 +181,27 @@ class GraphRulesCharacterUnifier(PipelineStep):
             extract a lot of spurious links.  However, linking by
             coref is sometimes the only way to resolve a character
             alias.
+        :param ignore_lone_titles: a set of titles to ignore when
+            they stand on their own.  This avoids extracting false
+            positive characters such as 'Mr.' or 'Miss'.
         """
         self.min_appearances = min_appearances
         self.additional_hypocorisms = additional_hypocorisms
         self.link_corefs_mentions = link_corefs_mentions
+        self.ignore_lone_titles = ignore_lone_titles or set()
+        self.character_ner_tag = "PER"  # a default value, will be set by _pipeline_init
         super().__init__()
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
         self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
         if not self.additional_hypocorisms is None:
             for name, nicknames in self.additional_hypocorisms:
                 self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)
-        return super()._pipeline_init_(lang, progress_reporter)
+        self.character_ner_tag = character_ner_tag
+        return super()._pipeline_init_(lang, **kwargs)
     def __call__(
         self,
@@ -196,12 +211,17 @@ class GraphRulesCharacterUnifier(PipelineStep):
     ) -> Dict[str, Any]:
         import networkx as nx
-        mentions = [m for m in entities if m.tag == "PER"]
-        mentions_str = [" ".join(m.tokens) for m in mentions]
+        mentions = [m for m in entities if m.tag == self.character_ner_tag]
+        mentions_str = set(
+            filter(
+                lambda m: not m in self.ignore_lone_titles,
+                map(lambda m: " ".join(m.tokens), mentions),
+            )
+        )
         # * create a graph where each node is a mention detected by NER
         G = nx.Graph()
-        for mention_str in set(mentions_str):
+        for mention_str in mentions_str:
             G.add_node(mention_str)
         # * HumanName local configuration - dependant on language

renard/pipeline/characters_extraction.py CHANGED Viewed

@@ -1,7 +1,9 @@
+import sys
 import renard.pipeline.character_unification as cu
 print(
-    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead.",
+    file=sys.stderr,
 )
 Character = cu.Character

renard/pipeline/core.py CHANGED Viewed

@@ -79,11 +79,18 @@ class PipelineStep:
         """Initialize the :class:`PipelineStep` with a given configuration."""
         pass
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
-        """Set the step configuration that is common to the whole pipeline.
-        :param lang: ISO 639-3 language string
-        :param progress_report:
+    def _pipeline_init_(
+        self, lang: str, progress_reporter: ProgressReporter, **kwargs
+    ) -> Optional[Dict[Pipeline.PipelineParameter, Any]]:
+        """Set the step configuration that is common to the whole
+        pipeline.
+        :param lang: the lang of the whole pipeline
+        :param progress_reporter:
+        :param kwargs: additional pipeline parameters.
+        :return: a step can return a dictionary of pipeline params if
+                 it wishes to modify some of these.
         """
         supported_langs = self.supported_langs()
         if not supported_langs == "any" and not lang in supported_langs:
@@ -150,13 +157,14 @@ class PipelineState:
     #: input text
     text: Optional[str]
-    #: text split into chapters
-    chapters: Optional[List[str]] = None
+    #: text split into blocks of texts. When dynamic blocks are given,
+    #: the final network is dynamic, and split according to blocks.
+    dynamic_blocks: Optional[List[Tuple[int, int]]] = None
     #: text splitted in tokens
     tokens: Optional[List[str]] = None
-    #: text splitted in tokens, by chapter
-    chapter_tokens: Optional[List[List[str]]] = None
+    #: mapping from a character to its corresponding token
+    char2token: Optional[List[int]] = None
     #: text splitted into sentences, each sentence being a list of
     #: tokens
     sentences: Optional[List[List[str]]] = None
@@ -182,14 +190,12 @@ class PipelineState:
     #: network)
     character_network: Optional[Union[List[nx.Graph], nx.Graph]] = None
+    # aliases of self.character_network
     def get_characters_graph(self) -> Optional[Union[List[nx.Graph], nx.Graph]]:
-        print(
-            "[warning] the characters_graph attribute is deprecated, use character_network instead",
-            file=sys.stderr,
-        )
         return self.character_network
     characters_graph = property(get_characters_graph)
+    character_graph = property(get_characters_graph)
     def get_character(
         self, name: str, partial_match: bool = True
@@ -280,6 +286,9 @@ class PipelineState:
         cumulative: bool = False,
         stable_layout: bool = False,
         layout: Optional[CharactersGraphLayout] = None,
+        node_kwargs: Optional[List[Dict[str, Any]]] = None,
+        edge_kwargs: Optional[List[Dict[str, Any]]] = None,
+        label_kwargs: Optional[List[Dict[str, Any]]] = None,
     ):
         """Plot ``self.character_graph`` using reasonable default
         parameters, and save the produced figures in the specified
@@ -294,6 +303,9 @@ class PipelineState:
             timestep.  Characters' positions are based on the final
             cumulative graph layout.
         :param layout: pre-computed graph layout
+        :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
+        :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
         """
         import matplotlib.pyplot as plt
@@ -317,13 +329,24 @@ class PipelineState:
             )
             layout = layout_nx_graph_reasonably(layout_graph)
+        node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
+        edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
+        label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
         for i, G in enumerate(graphs):
             _, ax = plt.subplots()
             local_layout = layout
             if not local_layout is None:
                 local_layout = layout_with_names(G, local_layout, name_style)
             G = graph_with_names(G, name_style=name_style)
-            plot_nx_graph_reasonably(G, ax=ax, layout=local_layout)
+            plot_nx_graph_reasonably(
+                G,
+                ax=ax,
+                layout=local_layout,
+                node_kwargs=node_kwargs[i],
+                edge_kwargs=edge_kwargs[i],
+                label_kwargs=label_kwargs[i],
+            )
             plt.savefig(f"{directory}/{i}.png")
             plt.close()
@@ -335,6 +358,9 @@ class PipelineState:
         ] = "most_frequent",
         layout: Optional[CharactersGraphLayout] = None,
         fig: Optional[plt.Figure] = None,
+        node_kwargs: Optional[Dict[str, Any]] = None,
+        edge_kwargs: Optional[Dict[str, Any]] = None,
+        label_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Plot ``self.character_graph`` using reasonable parameters,
         and save the produced figure to a file
@@ -344,6 +370,9 @@ class PipelineState:
         :param layout: pre-computed graph layout
         :param fig: if specified, this matplotlib figure will be used
             for plotting
+        :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
+        :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
         """
         import matplotlib.pyplot as plt
@@ -361,7 +390,14 @@ class PipelineState:
             fig.set_dpi(300)
             fig.set_size_inches(24, 24)
         ax = fig.add_subplot(111)
-        plot_nx_graph_reasonably(G, ax=ax, layout=layout)
+        plot_nx_graph_reasonably(
+            G,
+            ax=ax,
+            layout=layout,
+            node_kwargs=node_kwargs,
+            edge_kwargs=edge_kwargs,
+            label_kwargs=label_kwargs,
+        )
         plt.savefig(path)
         plt.close()
@@ -375,6 +411,9 @@ class PipelineState:
         graph_start_idx: int = 1,
         stable_layout: bool = False,
         layout: Optional[CharactersGraphLayout] = None,
+        node_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        edge_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        label_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """Plot ``self.character_network`` using reasonable default
         parameters
@@ -400,6 +439,9 @@ class PipelineState:
             same position in space at each timestep.  Characters'
             positions are based on the final cumulative graph layout.
         :param layout: pre-computed graph layout
+        :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
+        :param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
+        :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
         """
         import matplotlib.pyplot as plt
         from matplotlib.widgets import Slider
@@ -418,13 +460,30 @@ class PipelineState:
                 fig.set_dpi(300)
                 fig.set_size_inches(24, 24)
             ax = fig.add_subplot(111)
-            plot_nx_graph_reasonably(G, ax=ax, layout=layout)
+            assert not isinstance(node_kwargs, list)
+            assert not isinstance(edge_kwargs, list)
+            assert not isinstance(label_kwargs, list)
+            plot_nx_graph_reasonably(
+                G,
+                ax=ax,
+                layout=layout,
+                node_kwargs=node_kwargs,
+                edge_kwargs=edge_kwargs,
+                label_kwargs=label_kwargs,
+            )
             return
         if not isinstance(self.character_network, list):
             raise TypeError
         # self.character_network is a list: plot a dynamic graph
+        node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
+        assert isinstance(node_kwargs, list)
+        edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
+        assert isinstance(edge_kwargs, list)
+        label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
+        assert isinstance(label_kwargs, list)
         if fig is None:
             fig, ax = plt.subplots()
             assert not fig is None
@@ -440,12 +499,13 @@ class PipelineState:
         def update(slider_value):
             assert isinstance(self.character_network, list)
+            slider_i = int(slider_value) - 1
             character_networks = self.character_network
             if cumulative:
                 character_networks = cumulative_character_networks
-            G = character_networks[int(slider_value) - 1]
+            G = character_networks[slider_i]
             local_layout = layout
             if not local_layout is None:
@@ -453,7 +513,14 @@ class PipelineState:
             G = graph_with_names(G, name_style)
             ax.clear()
-            plot_nx_graph_reasonably(G, ax=ax, layout=local_layout)
+            plot_nx_graph_reasonably(
+                G,
+                ax=ax,
+                layout=local_layout,
+                node_kwargs=node_kwargs[slider_i],
+                edge_kwargs=edge_kwargs[slider_i],
+                label_kwargs=label_kwargs[slider_i],
+            )
             ax.set_xlim(-1.2, 1.2)
             ax.set_ylim(-1.2, 1.2)
@@ -474,6 +541,10 @@ class PipelineState:
 class Pipeline:
     """A flexible NLP pipeline"""
+    #: all the possible parameters of the whole pipeline, that are
+    #: shared between steps
+    PipelineParameter = Literal["lang", "progress_reporter", "character_ner_tag"]
     def __init__(
         self,
         steps: List[PipelineStep],
@@ -496,17 +567,27 @@ class Pipeline:
         self.progress_reporter = get_progress_reporter(progress_report)
         self.lang = lang
+        self.character_ner_tag = "PER"
         self.warn = warn
-    def _pipeline_init_steps(self, ignored_steps: Optional[List[str]] = None):
-        """
+    def _pipeline_init_steps_(self, ignored_steps: Optional[List[str]] = None):
+        """Initialise steps with global pipeline parameters.
         :param ignored_steps: a list of steps production.  All steps
             with a production in ``ignored_steps`` will be ignored.
         """
-        steps_progress_reporter = get_progress_reporter(self.progress_report)
+        steps_progress_reporter = self.progress_reporter.get_subreporter()
         steps = self._non_ignored_steps(ignored_steps)
+        pipeline_params = {
+            "progress_reporter": steps_progress_reporter,
+            "character_ner_tag": self.character_ner_tag,
+        }
         for step in steps:
-            step._pipeline_init_(self.lang, steps_progress_reporter)
+            step_additional_params = step._pipeline_init_(self.lang, **pipeline_params)
+            if not step_additional_params is None:
+                for key, value in step_additional_params.items():
+                    setattr(self, key, value)
+                    pipeline_params[key] = value
     def _non_ignored_steps(
         self, ignored_steps: Optional[List[str]]
@@ -549,13 +630,27 @@ class Pipeline:
                 return (
                     False,
                     [
-                        f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs (needs : {step.needs()}, available : {pipeline_state})"
+                        "".join(
+                            [
+                                f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs. "
+                                + f"needs: {step.needs()}. "
+                                + f"available: {pipeline_state}). "
+                                + f"missing: {step.needs() - pipeline_state}."
+                            ]
+                        ),
                     ],
                 )
             if not step.optional_needs().issubset(pipeline_state):
                 warnings.append(
-                    f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs : (optional needs : {step.optional_needs()}, available : {pipeline_state})"
+                    "".join(
+                        [
+                            f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs. "
+                            + f"needs: {step.optional_needs()}. "
+                            + f"available: {pipeline_state}). "
+                            + f"missing: {step.optional_needs() - pipeline_state}."
+                        ]
+                    )
                 )
             pipeline_state = pipeline_state.union(step.production())
@@ -582,9 +677,9 @@ class Pipeline:
             raise ValueError(warnings_or_errors)
         if self.warn:
             for warning in warnings_or_errors:
-                print(f"[warning] : {warning}")
+                print(f"[warning] : {warning}", file=sys.stderr)
-        self._pipeline_init_steps(ignored_steps)
+        self._pipeline_init_steps_(ignored_steps)
         state = PipelineState(text)
         # sets attributes to PipelineState dynamically. This ensures

renard/pipeline/corefs/corefs.py CHANGED Viewed

@@ -25,6 +25,7 @@ class BertCoreferenceResolver(PipelineStep):
         device: Literal["auto", "cuda", "cpu"] = "auto",
         tokenizer: Optional[PreTrainedTokenizerFast] = None,
         block_size: int = 512,
+        hierarchical_merging: bool = False,
     ) -> None:
         """
         .. note::
@@ -40,6 +41,10 @@ class BertCoreferenceResolver(PipelineStep):
         :param device: computation device
         :param block_size: size of blocks to pass to the coreference
             model
+        :param hierarchical_merging: if ``True``, attempts to use
+            tibert's hierarchical merging feature.  In that case,
+            blocks of size ``block_size`` are merged to perform
+            inference on the whole document.
         """
         if isinstance(model, str):
             self.hugginface_model_id = hugginface_model_id
@@ -58,15 +63,15 @@ class BertCoreferenceResolver(PipelineStep):
             self.device = torch.device(device)
         self.block_size = block_size
+        self.hierarchical_merging = hierarchical_merging
         super().__init__()
-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+    def _pipeline_init_(self, lang: str, **kwargs):
         from tibert import BertForCoreferenceResolution
         from transformers import BertTokenizerFast, AutoTokenizer
         if self.model is None:
             # the user supplied a huggingface ID: load model from the HUB
             if not self.hugginface_model_id is None:
                 self.model = BertForCoreferenceResolution.from_pretrained(
@@ -87,16 +92,29 @@ class BertCoreferenceResolver(PipelineStep):
         assert not self.tokenizer is None
-        super()._pipeline_init_(lang, progress_reporter)
+        super()._pipeline_init_(lang, **kwargs)
     def __call__(self, tokens: List[str], **kwargs) -> Dict[str, Any]:
-        from tibert import stream_predict_coref
+        from tibert import stream_predict_coref, predict_coref
+        from tibert.bertcoref import CoreferenceDocument
         blocks = [
             tokens[block_start : block_start + self.block_size]
             for block_start in range(0, len(tokens), self.block_size)
         ]
+        if self.hierarchical_merging:
+            doc = predict_coref(
+                blocks,
+                self.model,
+                self.tokenizer,
+                batch_size=self.batch_size,
+                quiet=True,
+                device_str=self.device,
+                hierarchical_merging=True,
+            )
+            return {"corefs": doc.coref_chains}
         coref_docs = []
         for doc in self._progress_(
             stream_predict_coref(
@@ -111,26 +129,7 @@ class BertCoreferenceResolver(PipelineStep):
         ):
             coref_docs.append(doc)
-        # chains found in coref_docs are each local to their
-        # blocks. The following code adjusts their start and end index
-        # to match their global coordinate in the text.
-        coref_chains = []
-        cur_doc_start = 0
-        for doc in coref_docs:
-            for chain in doc.coref_chains:
-                adjusted_chain = []
-                for mention in chain:
-                    # FIXME: It seems that a rare bug in Tibert can
-                    # -----  sometimes produce this unwanted state.
-                    if mention.start_idx is None:
-                        mention.start_idx = 0
-                    start_idx = mention.start_idx + cur_doc_start
-                    end_idx = mention.end_idx + cur_doc_start
-                    adjusted_chain.append(Mention(mention.tokens, start_idx, end_idx))
-                coref_chains.append(adjusted_chain)
-            cur_doc_start += len(doc)
-        return {"corefs": coref_chains}
+        return {"corefs": CoreferenceDocument.concatenated(coref_docs).coref_chains}
     def needs(self) -> Set[str]:
         return {"tokens"}
@@ -239,19 +238,19 @@ class SpacyCorefereeCoreferenceResolver(PipelineStep):
         self,
         text: str,
         tokens: List[str],
-        chapter_tokens: Optional[List[List[str]]] = None,
+        dynamic_blocks_tokens: Optional[List[List[str]]] = None,
         **kwargs,
     ) -> Dict[str, Any]:
         from spacy.tokens import Doc
         from coreferee.manager import CorefereeBroker
-        if chapter_tokens is None:
-            chapter_tokens = [tokens]
+        if dynamic_blocks_tokens is None:
+            dynamic_blocks_tokens = [tokens]
-        if len(chapter_tokens) > 1:
+        if len(dynamic_blocks_tokens) > 1:
             chunks = []
-            for chapter in chapter_tokens:
-                chunks += self._cut_into_chunks(chapter)
+            for block in dynamic_blocks_tokens:
+                chunks += self._cut_into_chunks(block)
         else:
             chunks = self._cut_into_chunks(tokens)
@@ -317,7 +316,7 @@ class SpacyCorefereeCoreferenceResolver(PipelineStep):
         return {"tokens"}
     def optional_needs(self) -> Set[str]:
-        return {"chapter_tokens"}
+        return {"dynamic_blocks_tokens"}
     def production(self) -> Set[str]:
         return {"corefs"}

renard-pipeline 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

Potentially problematic release.

renard-pipeline 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl