renard-pipeline 0.4.2-py3-none-any.whl → 0.6.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of renard-pipeline might be problematic.
- renard/graph_utils.py +11 -4
- renard/ner_utils.py +24 -14
- renard/pipeline/character_unification.py +62 -19
- renard/pipeline/characters_extraction.py +3 -1
- renard/pipeline/core.py +141 -26
- renard/pipeline/corefs/corefs.py +32 -33
- renard/pipeline/graph_extraction.py +281 -192
- renard/pipeline/ner/__init__.py +1 -0
- renard/pipeline/{ner.py → ner/ner.py} +47 -76
- renard/pipeline/ner/retrieval.py +375 -0
- renard/pipeline/progress.py +32 -1
- renard/pipeline/speaker_attribution.py +2 -3
- renard/pipeline/tokenization.py +59 -30
- renard/plot_utils.py +48 -28
- renard/resources/determiners/__init__.py +1 -0
- renard/resources/determiners/determiners.py +41 -0
- renard/resources/hypocorisms/hypocorisms.py +3 -2
- renard/utils.py +57 -1
- {renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/METADATA +45 -20
- renard_pipeline-0.6.0.dist-info/RECORD +39 -0
- renard_pipeline-0.4.2.dist-info/RECORD +0 -35
- {renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/LICENSE +0 -0
- {renard_pipeline-0.4.2.dist-info → renard_pipeline-0.6.0.dist-info}/WHEEL +0 -0
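
The headline API change in renard/pipeline/graph_extraction.py (diffed below) is that CoOccurrencesGraphExtractor's co_occurrences_dist becomes optional: leaving it unset makes the step require pre-computed co_occurrences_blocks at runtime, and the new additional_ner_classes parameter lets non-character entities into the graph. The following is a minimal sketch of the resulting construction paths, based only on the __init__ and needs() bodies shown in the diff and not verified against the installed package:

# Sketch of the 0.6.0 CoOccurrencesGraphExtractor construction paths,
# inferred from the __init__/needs() bodies in the diff below.
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor

# an int distance is normalized to (n, "tokens")
static = CoOccurrencesGraphExtractor(co_occurrences_dist=25)
assert static.co_occurrences_dist == (25, "tokens")

# sentence-based distance, keeping location entities as extra
# (non-unified) nodes; this makes the step need "entities" at runtime
with_locs = CoOccurrencesGraphExtractor(
    co_occurrences_dist=(3, "sentences"),
    additional_ner_classes=["LOC"],
)
assert "entities" in with_locs.needs()

# omitting co_occurrences_dist now declares a runtime need for
# "co_occurrences_blocks" (plus "char2token" for unit conversion)
blockwise = CoOccurrencesGraphExtractor()
assert "co_occurrences_blocks" in blockwise.needs()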
--- a/renard/pipeline/graph_extraction.py
+++ b/renard/pipeline/graph_extraction.py
@@ -1,12 +1,12 @@
-import itertools
 from typing import Dict, Any, List, Set, Optional, Tuple, Literal, Union
+import itertools as it
 import operator
-from itertools import accumulate
 
 import networkx as nx
 import numpy as np
 from more_itertools import windowed
 
+from renard.utils import BlockBounds, charbb2tokenbb
 from renard.pipeline.ner import NEREntity
 from renard.pipeline.core import PipelineStep
 from renard.pipeline.character_unification import Character
@@ -15,65 +15,63 @@ from renard.pipeline.quote_detection import Quote
 
 def sent_index_for_token_index(token_index: int, sentences: List[List[str]]) -> int:
     """Compute the index of the sentence of the token at ``token_index``"""
-    sents_len = accumulate([len(s) for s in sentences], operator.add)
+    sents_len = it.accumulate([len(s) for s in sentences], operator.add)
     return next((i for i, l in enumerate(sents_len) if l > token_index))
 
 
-def
-
+def sent_indices_for_block(
+    dynamic_block: Tuple[int, int], sentences: List[List[str]]
 ) -> Tuple[int, int]:
     """Return the indices of the first and the last sentence of a
-
+    block
 
-    :param
-    :param chapter_idx: index of the chapter for which sentence
-        indices are returned
-    :param sentences: all sentences
+    :param dynamic_block: (START, END) in tokens
     :return: ``(first sentence index, last sentence index)``
     """
-
-
-
-    sents_end_idx = None
+    block_start, block_end = dynamic_block
+    sents_start = None
+    sents_end = None
     count = 0
     for sent_i, sent in enumerate(sentences):
-
-        count =
-        if
-
-        if
-
+        start, end = (count, count + len(sent))
+        count = end
+        if sents_start is None and start >= block_start:
+            sents_start = sent_i
+        if sents_end is None and end >= block_end:
+            # this happens when the block is _smaller_ than the
+            # current sentence. In that case, we return the current
+            # sentence even though it overflows the block.
+            if sents_start is None:
+                sents_start = sent_i
+            sents_end = sent_i
             break
-    assert not
-    return (
+    assert not sents_start is None and not sents_end is None
+    return (sents_start, sents_end)
 
 
-def
-
-    mentions: List[Tuple[
-) -> List[List[Tuple[
-    """Return each
+def mentions_for_blocks(
+    block_bounds: BlockBounds,
+    mentions: List[Tuple[Any, NEREntity]],
+) -> List[List[Tuple[Any, NEREntity]]]:
+    """Return each block mentions.
 
-    :param
+    :param block_bounds: block bounds, in tokens
     :param mentions:
 
-    :return: a list of mentions per
-        ``len(
+    :return: a list of mentions per blocks. This list has len
+        ``len(block_bounds)``.
     """
-
+    assert block_bounds[1] == "tokens"
 
-
-        itertools.accumulate([0] + [len(chapter) for chapter in chapters[:-1]])
-    )
-    end_indices = start_indices[1:] + [start_indices[-1] + len(chapters[-1])]
+    blocks_mentions = [[] for _ in range(len(block_bounds[0]))]
 
     for mention in mentions:
-        for
+        for block_i, (start_i, end_i) in enumerate(block_bounds[0]):
             if mention[1].start_idx >= start_i and mention[1].end_idx < end_i:
-
+                blocks_mentions[block_i].append(mention)
                 break
 
-    return
+    return blocks_mentions
 
 
 class CoOccurrencesGraphExtractor(PipelineStep):
@@ -83,13 +81,11 @@ class CoOccurrencesGraphExtractor(PipelineStep):
         self,
         co_occurrences_dist: Optional[
            Union[int, Tuple[int, Literal["tokens", "sentences"]]]
-        ],
+        ] = None,
         dynamic: bool = False,
         dynamic_window: Optional[int] = None,
         dynamic_overlap: int = 0,
-
-            Union[int, Tuple[int, Literal["tokens", "sentences"]]]
-        ] = None,
+        additional_ner_classes: Optional[List[str]] = None,
     ) -> None:
         """
         :param co_occurrences_dist: max accepted distance between two
@@ -111,10 +107,10 @@ class CoOccurrencesGraphExtractor(PipelineStep):
             that case, ``dynamic_window`` and
             ``dynamic_overlap``*can* be specified. If
             ``dynamic_window`` is not specified, this step is
-            expecting the text to be cut into
-            graph will be extracted for each
-            case, ``
-            a ``List[str]`` at runtime.
+            expecting the text to be cut into 'dynamic blocks',
+            and a graph will be extracted for each block. In
+            that case, ``dynamic_blocks`` must be passed to the
+            pipeline as a ``List[str]`` at runtime.
 
         :param dynamic_window: dynamic window, in number of
             interactions. a dynamic window of `n` means that each
@@ -122,19 +118,15 @@ class CoOccurrencesGraphExtractor(PipelineStep):
 
         :param dynamic_overlap: overlap, in number of interactions.
 
-        :param
-
-
+        :param additional_ner_classes: if specified, will include
+            entities other than characters in the final graph. No
+            attempt will be made at unifying the entities (for example,
+            "New York" will be distinct from "New York City").
         """
-        # typo retrocompatibility
-        if not co_occurences_dist is None:
-            co_occurrences_dist = co_occurences_dist
-        if co_occurrences_dist is None and co_occurences_dist is None:
-            raise ValueError()
-
         if isinstance(co_occurrences_dist, int):
             co_occurrences_dist = (co_occurrences_dist, "tokens")
         self.co_occurrences_dist = co_occurrences_dist
+        self.need_co_occurrences_blocks = co_occurrences_dist is None
 
         if dynamic:
             if not dynamic_window is None:
@@ -143,89 +135,135 @@ class CoOccurrencesGraphExtractor(PipelineStep):
         self.dynamic = dynamic
         self.dynamic_window = dynamic_window
         self.dynamic_overlap = dynamic_overlap
-        self.
+        self.need_dynamic_blocks = dynamic and dynamic_window is None
+
+        self.additional_ner_classes = additional_ner_classes or []
+
         super().__init__()
 
     def __call__(
         self,
         characters: Set[Character],
         sentences: List[List[str]],
-
+        char2token: Optional[List[int]] = None,
+        dynamic_blocks: Optional[BlockBounds] = None,
         sentences_polarities: Optional[List[float]] = None,
+        entities: Optional[List[NEREntity]] = None,
+        co_occurrences_blocks: Optional[BlockBounds] = None,
         **kwargs,
     ) -> Dict[str, Any]:
-        """Extract a
+        """Extract a co-occurrence character network.
 
-        :param
+        :param co_occurrences_blocks: custom blocks where
+            co-occurrences should be recorded. For example, this can
+            be used to perform chapter level co-occurrences.
 
         :return: a ``dict`` with key ``'character_network'`` and a
-
-
+            :class:`nx.Graph` or a list of :class:`nx.Graph` as
+            value.
         """
         mentions = []
         for character in characters:
             for mention in character.mentions:
                 mentions.append((character, mention))
+
+        if len(self.additional_ner_classes) > 0:
+            assert not entities is None
+            for entity in entities:
+                if entity.tag in self.additional_ner_classes:
+                    mentions.append((" ".join(entity.tokens), entity))
+
         mentions = sorted(mentions, key=lambda cm: cm[1].start_idx)
 
+        # convert from char blocks to token blocks
+        if not dynamic_blocks is None and dynamic_blocks[1] == "characters":
+            assert not char2token is None
+            dynamic_blocks = charbb2tokenbb(dynamic_blocks, char2token)
+        if (
+            not co_occurrences_blocks is None
+            and co_occurrences_blocks[1] == "characters"
+        ):
+            assert not char2token is None
+            co_occurrences_blocks = charbb2tokenbb(co_occurrences_blocks, char2token)
+
         if self.dynamic:
             return {
                 "character_network": self._extract_dynamic_graph(
                     mentions,
                     self.dynamic_window,
                     self.dynamic_overlap,
-
+                    dynamic_blocks,
                     sentences,
                     sentences_polarities,
+                    co_occurrences_blocks,
                 )
             }
         return {
             "character_network": self._extract_graph(
-                mentions, sentences, sentences_polarities
+                mentions, sentences, sentences_polarities, co_occurrences_blocks
            )
        }
 
-    def
-        self,
-
-
-
-
-        """Check if two mentions are close enough to be in interactions.
-
-        .. note::
-
-            the attribute ``self.co_occurrences_dist`` is used to know wether mentions are in co_occurences
+    def _create_co_occurrences_blocks(
+        self, sentences: List[List[str]], mentions: List[Tuple[Any, NEREntity]]
+    ) -> BlockBounds:
+        """Create co-occurrences blocks using
+        ``self.co_occurrences_dist``. All entities within a block are
+        considered as co-occurring.
 
-        :param mention_1:
-        :param mention_2:
         :param sentences:
-        :return: a boolean indicating wether the two mentions are co-occuring
         """
-
-
-
-
-
-
-
-
-
-
-
-        return
+        assert not self.co_occurrences_dist is None
+
+        dist_unit = self.co_occurrences_dist[1]
+
+        if dist_unit == "tokens":
+            tokens_dist = self.co_occurrences_dist[0]
+            blocks = []
+            for _, entity in mentions:
+                block_start = entity.start_idx - tokens_dist
+                block_end = entity.end_idx + tokens_dist
+                blocks.append((block_start, block_end))
+            return (blocks, "tokens")
+
+        elif dist_unit == "sentences":
+            blocks_indices = set()
+            sent_dist = self.co_occurrences_dist[0]
+            for _, entity in mentions:
+                start_sent_i = max(
+                    0,
+                    sent_index_for_token_index(entity.start_idx, sentences) - sent_dist,
+                )
+                start_token_i = sum(len(sent) for sent in sentences[:start_sent_i])
+                end_sent_i = min(
+                    len(sentences) - 1,
+                    sent_index_for_token_index(entity.end_idx - 1, sentences)
+                    + sent_dist,
+                )
+                end_token_i = sum(len(sent) for sent in sentences[: end_sent_i + 1])
+                blocks_indices.add((start_token_i, end_token_i))
+            blocks = [
+                (start, end)
+                for start, end in sorted(blocks_indices, key=lambda indices: indices[0])
+            ]
+            return (blocks, "tokens")
+
         else:
-            raise
+            raise ValueError(
+                f"co_occurrences_dist unit should be one of: 'tokens', 'sentences'"
+            )
 
     def _extract_graph(
         self,
-        mentions: List[Tuple[
+        mentions: List[Tuple[Any, NEREntity]],
         sentences: List[List[str]],
         sentences_polarities: Optional[List[float]],
-
+        co_occurrences_blocks: Optional[BlockBounds],
+    ) -> nx.Graph:
         """
-        :param mentions: A list of
-            appearance
+        :param mentions: A list of entity mentions, ordered by
+            appearance, each of the form (KEY MENTION). KEY
+            determines the unicity of the entity.
         :param sentences: if specified, ``sentences_polarities`` must
             be specified as well.
         :param sentences_polarities: if specified, ``sentences`` must
@@ -234,25 +272,37 @@ class CoOccurrencesGraphExtractor(PipelineStep):
             of the relationship between two characters. Polarity
             between two interactions is computed as the strongest
             sentence polarity between those two mentions.
+        :param co_occurrences_blocks: only unit 'tokens' is accepted.
         """
         compute_polarity = not sentences_polarities is None
 
+        assert co_occurrences_blocks is None or co_occurrences_blocks[1] == "tokens"
+        if co_occurrences_blocks is None:
+            co_occurrences_blocks = self._create_co_occurrences_blocks(
+                sentences, mentions
+            )
+
         # co-occurence matrix, where C[i][j] is 1 when appearance
         # i co-occur with j if i < j, or 0 when it doesn't
         C = np.zeros((len(mentions), len(mentions)))
-        for
-            #
-
-
-
-
+        for block_start, block_end in co_occurrences_blocks[0]:
+            # collect all mentions in this co-occurrences block
+            block_mentions = []
+            for i, (key, mention) in enumerate(mentions):
+                if mention.start_idx >= block_start and mention.end_idx <= block_end:
+                    block_mentions.append((i, key, mention))
+                # since mentions are ordered, the first mention
+                # outside of the blocks ends the search inside this block
+                if mention.start_idx > block_end:
                    break
-
-
+            # assign mentions in this co-occurrences blocks to C
+            for m1, m2 in it.combinations(block_mentions, 2):
+                i1, key1, mention1 = m1
+                i2, key2, mention2 = m2
+                # ignore co-occurrence with self
+                if key1 == key2:
                    continue
-
-        C[i][i + 1 + j] = 1
+                C[i1][i2] = 1
 
         # * Construct graph from co-occurence matrix
         G = nx.Graph()
@@ -291,25 +341,29 @@ class CoOccurrencesGraphExtractor(PipelineStep):
 
     def _extract_dynamic_graph(
         self,
-        mentions: List[Tuple[
+        mentions: List[Tuple[Any, NEREntity]],
         window: Optional[int],
         overlap: int,
-
+        dynamic_blocks: Optional[BlockBounds],
         sentences: List[List[str]],
         sentences_polarities: Optional[List[float]],
+        co_occurrences_blocks: Optional[BlockBounds],
     ) -> List[nx.Graph]:
         """
         .. note::
 
-            only one of ``window`` or ``
+            only one of ``window`` or ``dynamic_blocks_tokens`` should be specified
 
-        :param mentions: A list of
+        :param mentions: A list of entity mentions, ordered by
+            appearance, each of the form (KEY MENTION). KEY
+            determines the unicity of the entity.
         :param window: dynamic window, in tokens.
         :param overlap: window overlap
-        :param
-
+        :param dynamic_blocks: boundaries of each dynamic block
+        :param co_occurrences_blocks: boundaries of each co-occurrences blocks
         """
-        assert
+        assert co_occurrences_blocks is None or co_occurrences_blocks[1] == "tokens"
+        assert window is None or dynamic_blocks is None
         compute_polarity = not sentences is None and not sentences_polarities is None
 
         if not window is None:
@@ -318,104 +372,66 @@ class CoOccurrencesGraphExtractor(PipelineStep):
                     [elt for elt in ct if not elt is None],
                     sentences,
                     sentences_polarities,
+                    co_occurrences_blocks,
                 )
                 for ct in windowed(mentions, window, step=window - overlap)
             ]
 
-        assert not
+        assert not dynamic_blocks is None
 
         graphs = []
 
-
-        for
-
-        ):
-            chapter_start_idx = sum(
-                [len(c) for i, c in enumerate(chapter_tokens) if i < chapter_i]
-            )
-            # make mentions coordinates chapter local
-            chapter_mentions = [
-                (c, m.shifted(-chapter_start_idx)) for c, m in chapter_mentions
-            ]
+        blocks_mentions = mentions_for_blocks(dynamic_blocks, mentions)
+        for dynamic_block, block_mentions in zip(dynamic_blocks[0], blocks_mentions):
+            block_start, block_end = dynamic_block
 
-
-
-            )
-            chapter_sentences = sentences[sent_start_idx : sent_end_idx + 1]
+            sent_start, sent_end = sent_indices_for_block(dynamic_block, sentences)
+            block_sentences = sentences[sent_start : sent_end + 1]
 
-
+            block_sentences_polarities = None
             if compute_polarity:
                 assert not sentences_polarities is None
-
-
+                block_sentences_polarities = sentences_polarities[
+                    sent_start : sent_end + 1
+                ]
+
+            if co_occurrences_blocks is None:
+                block_co_occ_bounds = None
+            else:
+                bounds = [
+                    (start, end)
+                    for start, end in co_occurrences_blocks[0]
+                    if start >= block_start and end <= block_end
                ]
+                block_co_occ_bounds = (bounds, "tokens")
 
             graphs.append(
                 self._extract_graph(
-
-
-
+                    block_mentions,
+                    block_sentences,
+                    block_sentences_polarities,
+                    block_co_occ_bounds,
                )
            )
 
         return graphs
 
-    def _extract_gephi_dynamic_graph(
-        self, mentions: List[Tuple[Character, NEREntity]], sentences: List[List[str]]
-    ) -> nx.Graph:
-        """
-        :param mentions: A list of character mentions, ordered by appearance
-        :param sentences:
-        """
-        # keep only longest name in graph node : possible only if it is unique
-        # TODO: might want to try and get shorter names if longest names aren't
-        # unique
-        characters = set([e[0] for e in mentions])
-
-        G = nx.Graph()
-
-        character_to_last_appearance: Dict[Character, Optional[NEREntity]] = {
-            character: None for character in characters
-        }
-
-        for i, (character, mention) in enumerate(mentions):
-            if not character in characters:
-                continue
-            character_to_last_appearance[character] = mention
-            close_characters = [
-                c
-                for c, last_appearance in character_to_last_appearance.items()
-                if not last_appearance is None
-                and self._mentions_interact(mention, last_appearance, sentences)
-                and not c == character
-            ]
-            for close_character in close_characters:
-                if not G.has_edge(character, close_character):
-                    G.add_edge(character, close_character)
-                    G.edges[character, close_character]["start"] = i
-                    G.edges[character, close_character]["dweight"] = []
-                # add a new entry to the weight series according to networkx
-                # source code, each entry must be of the form
-                # [value, start, end]
-                weights = G.edges[character, close_character]["dweight"]
-                if len(weights) != 0:
-                    # end of last weight attribute
-                    weights[-1][-1] = i
-                # value, start and end of current weight attribute
-                last_weight_value = weights[-1][0] if len(weights) > 0 else 0
-                G.edges[character, close_character]["dweight"].append(
-                    [float(last_weight_value) + 1, i, len(mentions)]
-                )
-
-        return G
-
     def supported_langs(self) -> Union[Set[str], Literal["any"]]:
         return "any"
 
     def needs(self) -> Set[str]:
         needs = {"characters", "sentences"}
-
-
+
+        if self.need_dynamic_blocks:
+            needs.add("dynamic_blocks")
+            needs.add("char2token")
+        if self.need_co_occurrences_blocks:
+            needs.add("co_occurrences_blocks")
+            needs.add("char2token")
+
+        if len(self.additional_ner_classes) > 0:
+            needs.add("entities")
+
         return needs
 
     def production(self) -> Set[str]:
@@ -426,26 +442,49 @@ class CoOccurrencesGraphExtractor(PipelineStep):
 
 
 class ConversationalGraphExtractor(PipelineStep):
-    """A graph extractor using conversation between characters
+    """A graph extractor using conversation between characters or
+    mentions.
 
     .. note::
 
-
-        for now.
+        Does not support dynamic networks yet.
     """
 
     def __init__(
-        self,
+        self,
+        graph_type: Literal["conversation", "mention"],
+        conversation_dist: Optional[
+            Union[int, Tuple[int, Literal["tokens", "sentences"]]]
+        ] = None,
+        ignore_self_mention: bool = True,
     ):
+        """
+        :param graph_type: either 'conversation' or 'mention'.
+            'conversation' extracts an undirected graph with
+            interactions being extracted from the conversations
+            occurring between characters. 'mention' extracts a
+            directed graph where interactions are character mentions
+            of one another in quoted speech.
+        :param conversation_dist: must be supplied if `graph_type` is
+            'conversation'. The distance between two quotation for
+            them to be considered as being interacting.
+        :param ignore_self_mention: if ``True``, self mentions are
+            ignore for ``graph_type=='mention'``
+        """
+        self.graph_type = graph_type
+
         if isinstance(conversation_dist, int):
             conversation_dist = (conversation_dist, "tokens")
         self.conversation_dist = conversation_dist
 
+        self.ignore_self_mention = ignore_self_mention
+
         super().__init__()
 
     def _quotes_interact(
         self, quote_1: Quote, quote_2: Quote, sentences: List[List[str]]
     ) -> bool:
+        assert not self.conversation_dist is None
         ordered = quote_2.start >= quote_1.end
         if self.conversation_dist[1] == "tokens":
             return (
@@ -467,14 +506,13 @@ class ConversationalGraphExtractor(PipelineStep):
         else:
             raise NotImplementedError
 
-    def
+    def _conversation_extract(
         self,
         sentences: List[List[str]],
         quotes: List[Quote],
         speakers: List[Optional[Character]],
         characters: Set[Character],
-
-    ) -> Dict[str, Any]:
+    ) -> nx.Graph:
         G = nx.Graph()
         for character in characters:
             G.add_node(character)
@@ -504,6 +542,57 @@ class ConversationalGraphExtractor(PipelineStep):
                     G.add_edge(speaker_1, speaker_2, weight=0)
                 G.edges[speaker_1, speaker_2]["weight"] += 1
 
+        return G
+
+    def _mention_extract(
+        self,
+        quotes: List[Quote],
+        speakers: List[Optional[Character]],
+        characters: Set[Character],
+    ) -> nx.Graph:
+        G = nx.DiGraph()
+        for character in characters:
+            G.add_node(character)
+
+        for quote, speaker in zip(quotes, speakers):
+            # no speaker prediction: ignore
+            if speaker is None:
+                continue
+
+            # TODO: optim
+            # find characters mentioned in quote and add a directed
+            # edge speaker => character
+            for character in characters:
+                if character == speaker and self.ignore_self_mention:
+                    continue
+                for mention in character.mentions:
+                    if (
+                        mention.start_idx >= quote.start
+                        and mention.end_idx <= quote.end
+                    ):
+                        if not G.has_edge(speaker, character):
+                            G.add_edge(speaker, character, weight=0)
+                        G.edges[speaker, character]["weight"] += 1
+                        break
+
+        return G
+
+    def __call__(
+        self,
+        sentences: List[List[str]],
+        quotes: List[Quote],
+        speakers: List[Optional[Character]],
+        characters: Set[Character],
+        **kwargs,
+    ) -> Dict[str, Any]:
+
+        if self.graph_type == "conversation":
+            G = self._conversation_extract(sentences, quotes, speakers, characters)
+        elif self.graph_type == "mention":
+            G = self._mention_extract(quotes, speakers, characters)
+        else:
+            raise ValueError(f"unknown graph_type: {self.graph_type}")
+
         return {"character_network": G}
 
     def needs(self) -> Set[str]: