renard-pipeline 0.3.1__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of renard-pipeline might be problematic.
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/PKG-INFO +42 -4
- renard_pipeline-0.4.1/README.md +65 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/pyproject.toml +6 -2
- renard_pipeline-0.4.1/renard/ner_utils.py +342 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/character_unification.py +10 -11
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/characters_extraction.py +1 -1
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/core.py +51 -34
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/graph_extraction.py +7 -10
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/ner.py +79 -58
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/stanford_corenlp.py +1 -1
- renard_pipeline-0.4.1/renard/py.typed +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/utils.py +1 -52
- renard_pipeline-0.3.1/README.md +0 -31
- renard_pipeline-0.3.1/renard/ner_utils.py +0 -80
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/LICENSE +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/gender.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/graph_utils.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/nltk_utils.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/corefs/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/corefs/corefs.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/preconfigured.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/preprocessing.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/progress.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/quote_detection.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/sentiment_analysis.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/speaker_attribution.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/tokenization.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/plot_utils.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/datas/License.txt +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/hypocorisms.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/pronouns/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/pronouns/pronouns.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/titles/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/titles/titles.py +0 -0
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/PKG-INFO

@@ -1,7 +1,8 @@
 Metadata-Version: 2.1
 Name: renard-pipeline
-Version: 0.3.1
+Version: 0.4.1
 Summary: Relationships Extraction from NARrative Documents
+Home-page: https://github.com/CompNet/Renard
 License: GPL-3.0-only
 Author: Arthur Amalvy
 Author-email: arthur.amalvy@univ-avignon.fr
@@ -14,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
 Provides-Extra: spacy
 Provides-Extra: stanza
 Requires-Dist: coreferee (>=1.4.0,<2.0.0) ; extra == "spacy"
+Requires-Dist: datasets (>=2.16.1,<3.0.0)
 Requires-Dist: grimbert (>=0.1.0,<0.2.0)
 Requires-Dist: matplotlib (>=3.5.3,<4.0.0)
 Requires-Dist: more-itertools (>=10.1.0,<11.0.0)
@@ -26,15 +28,19 @@ Requires-Dist: seqeval (==1.2.2)
 Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
 Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
 Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
-Requires-Dist: tibert (>=0.
+Requires-Dist: tibert (>=0.3.0,<0.4.0)
 Requires-Dist: torch (>=2.0.0,!=2.0.1)
 Requires-Dist: tqdm (>=4.62.3,<5.0.0)
 Requires-Dist: transformers (>=4.36.0,<5.0.0)
+Project-URL: Documentation, https://compnet.github.io/Renard/
+Project-URL: Repository, https://github.com/CompNet/Renard
 Description-Content-Type: text/markdown
 
 # Renard
 
-Relationships Extraction from NARrative Documents
+Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+
 
 
 # Installation
@@ -43,6 +49,8 @@ You can install the latest version using pip:
 
 > pip install renard-pipeline
 
+Currently, Renard supports Python 3.8, 3.9 and 3.10.
+
 
 # Documentation
 
@@ -53,7 +61,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
 
 # Tutorial
 
-
+Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+```python
+from renard.pipeline import Pipeline
+from renard.pipeline.tokenization import NLTKTokenizer
+from renard.pipeline.ner import NLTKNamedEntityRecognizer
+from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+with open("./my_doc.txt") as f:
+    text = f.read()
+
+pipeline = Pipeline(
+    [
+        NLTKTokenizer(),
+        NLTKNamedEntityRecognizer(),
+        GraphRulesCharacterUnifier(min_appearance=10),
+        CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+    ]
+)
+
+out = pipeline(text)
+```
+
+For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+
 
 
 # Running tests
@@ -64,3 +97,8 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
 
 Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
 
+
+# Contributing
+
+see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
+
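The tutorial above stops at `out = pipeline(text)`. As a rough follow-up sketch (not shown in this diff, so the attribute name and plotting code are assumptions to check against the Renard documentation), the returned pipeline state is expected to expose the extracted character network as a `networkx` graph:

```python
# Sketch only: `out` is the state returned by `pipeline(text)` above.
# `character_network` is an assumed attribute name, inferred from the
# graph_extraction / character_unification module names in this release.
import matplotlib.pyplot as plt
import networkx as nx

graph = out.character_network  # assumed: a networkx graph of characters
nx.draw(graph, with_labels=True, labels={c: str(c) for c in graph.nodes()})
plt.show()
```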
renard_pipeline-0.4.1/README.md (new file)

@@ -0,0 +1,65 @@
+# Renard
+
+Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+
+
+
+# Installation
+
+You can install the latest version using pip:
+
+> pip install renard-pipeline
+
+Currently, Renard supports Python 3.8, 3.9 and 3.10.
+
+
+# Documentation
+
+Documentation, including installation instructions, can be found at https://compnet.github.io/Renard/
+
+If you need local documentation, it can be generated using `Sphinx`. From the `docs` directory, `make html` should create documentation under `docs/_build/html`.
+
+
+# Tutorial
+
+Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+```python
+from renard.pipeline import Pipeline
+from renard.pipeline.tokenization import NLTKTokenizer
+from renard.pipeline.ner import NLTKNamedEntityRecognizer
+from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+with open("./my_doc.txt") as f:
+    text = f.read()
+
+pipeline = Pipeline(
+    [
+        NLTKTokenizer(),
+        NLTKNamedEntityRecognizer(),
+        GraphRulesCharacterUnifier(min_appearance=10),
+        CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+    ]
+)
+
+out = pipeline(text)
+```
+
+For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+
+
+
+# Running tests
+
+`Renard` uses `pytest` for testing. To launch tests, use the following command :
+
+> poetry run python -m pytest tests
+
+Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
+
+
+# Contributing
+
+see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "renard-pipeline"
-version = "0.3.1"
+version = "0.4.1"
 description = "Relationships Extraction from NARrative Documents"
 authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
 license = "GPL-3.0-only"
@@ -8,6 +8,9 @@ readme = "README.md"
 packages = [
     { include = "renard" }
 ]
+homepage = "https://github.com/CompNet/Renard"
+repository = "https://github.com/CompNet/Renard"
+documentation = "https://compnet.github.io/Renard/"
 
 [tool.poetry.dependencies]
 # optional dependencies
@@ -28,8 +31,9 @@ matplotlib = "^3.5.3"
 seqeval = "1.2.2"
 pandas = "^2.0.0"
 pytest = "^7.2.1"
-tibert = "^0.
+tibert = "^0.3.0"
 grimbert = "^0.1.0"
+datasets = "^2.16.1"
 
 [tool.poetry.dev-dependencies]
 hypothesis = "^6.24.0"
renard_pipeline-0.4.1/renard/ner_utils.py (new file)

@@ -0,0 +1,342 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, List, Optional, Union, Dict, Tuple
+import os, re
+import itertools as it
+import functools as ft
+from more_itertools import flatten
+import torch
+from torch.utils.data import Dataset
+from datasets import Dataset as HGDataset
+from datasets import Sequence, ClassLabel
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    PreTrainedTokenizerFast,
+    PreTrainedModel,
+    Trainer,
+    TrainingArguments,
+)
+from transformers.tokenization_utils_base import BatchEncoding
+
+if TYPE_CHECKING:
+    from renard.pipeline.ner import NEREntity
+
+
+class DataCollatorForTokenClassificationWithBatchEncoding:
+    """Same as ``transformers.DataCollatorForTokenClassification``,
+    except it correctly returns a ``BatchEncoding`` object with
+    correct ``encodings`` attribute.
+
+    Don't know why this is not the default ?
+    """
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerFast,
+        pad_to_multiple_of: Optional[int] = None,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.pad_to_multiple_of = pad_to_multiple_of
+        self.pad_token_id = {"label": -100, "labels": -100}
+
+    def __call__(self, features: List[dict]) -> Union[dict, BatchEncoding]:
+        keys = features[0].keys()
+        sequence_len = max([len(f["input_ids"]) for f in features])
+
+        # We do the padding and collating manually instead of calling
+        # self.tokenizer.pad, because pad does not work on arbitrary
+        # features.
+        batch = BatchEncoding({})
+        for key in keys:
+            if self.tokenizer.padding_side == "right":
+                batch[key] = [
+                    f[key]
+                    + [self.pad_token_id.get(key, 0)] * (sequence_len - len(f[key]))
+                    for f in features
+                ]
+            else:
+                batch[key] = [
+                    [self.pad_token_id.get(key, 0)]
+                    * (sequence_len - len(f[key]))
+                    + f[key]
+                    for f in features
+                ]
+
+        batch._encodings = [f.encodings[0] for f in features]
+
+        for k, v in batch.items():
+            batch[k] = torch.tensor(v)
+
+        return batch
+
+
+class NERDataset(Dataset):
+    """
+    :ivar _context_mask: for each element, a mask indicating which
+        tokens are part of the context (1 for context, 0 for text on
+        which to perform inference). The mask allows to discard
+        predictions made for context at inference time, even though
+        the context can still be passed as input to the model.
+    """
+
+    def __init__(
+        self,
+        elements: List[List[str]],
+        tokenizer: PreTrainedTokenizerFast,
+        context_mask: Optional[List[List[int]]] = None,
+    ) -> None:
+        self.elements = elements
+
+        if context_mask:
+            assert all(
+                [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
+            )
+        self._context_mask = context_mask or [[0] * len(elt) for elt in self.elements]
+
+        self.tokenizer = tokenizer
+
+    def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
+        element = self.elements[index]
+
+        batch = self.tokenizer(
+            element,
+            truncation=True,
+            max_length=512,  # TODO
+            is_split_into_words=True,
+        )
+
+        batch["context_mask"] = [0] * len(batch["input_ids"])
+        elt_context_mask = self._context_mask[index]
+        for i in range(len(element)):
+            w2t = batch.word_to_tokens(0, i)
+            mask_value = elt_context_mask[i]
+            tokens_mask = [mask_value] * (w2t.end - w2t.start)
+            batch["context_mask"][w2t.start : w2t.end] = tokens_mask
+
+        return batch
+
+    def __len__(self) -> int:
+        return len(self.elements)
+
+
+def ner_entities(
+    tokens: List[str], bio_tags: List[str], resolve_inconsistencies: bool = True
+) -> List[NEREntity]:
+    """Extract NER entities from a list of BIO tags
+
+    :param tokens: a list of tokens
+    :param bio_tags: a list of BIO tags. In particular, BIO tags
+        should be in the CoNLL-2002 form (such as 'B-PER I-PER')
+
+    :return: A list of ner entities, in apparition order
+    """
+    from renard.pipeline.ner import NEREntity
+
+    assert len(tokens) == len(bio_tags)
+
+    entities = []
+    current_tag: Optional[str] = None
+    current_tag_start_idx: Optional[int] = None
+
+    for i, tag in enumerate(bio_tags):
+        if not current_tag is None and not tag.startswith("I-"):
+            assert not current_tag_start_idx is None
+            entities.append(
+                NEREntity(
+                    tokens[current_tag_start_idx:i],
+                    current_tag_start_idx,
+                    i,
+                    current_tag,
+                )
+            )
+            current_tag = None
+            current_tag_start_idx = None
+
+        if tag.startswith("B-"):
+            current_tag = tag[2:]
+            current_tag_start_idx = i
+
+        elif tag.startswith("I-"):
+            if current_tag is None and resolve_inconsistencies:
+                current_tag = tag[2:]
+                current_tag_start_idx = i
+                continue
+
+    if not current_tag is None:
+        assert not current_tag_start_idx is None
+        entities.append(
+            NEREntity(
+                tokens[current_tag_start_idx : len(tokens)],
+                current_tag_start_idx,
+                len(bio_tags),
+                current_tag,
+            )
+        )
+
+    return entities
+
+
+def load_conll2002_bio(
+    path: str,
+    tag_conversion_map: Optional[Dict[str, str]] = None,
+    separator: str = "\t",
+    **kwargs,
+) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
+    """Load a file under CoNLL-2002 BIO format. Sentences are expected
+    to be separated by end of lines. Tags should be in the CoNLL-2002
+    format (such as 'B-PER I-PER') - If this is not the case, see the
+    ``tag_conversion_map`` argument.
+
+    :param path: path to the CoNLL-2002 formatted file
+    :param separator: separator between token and BIO tags
+    :param tag_conversion_map: conversion map for tags found in the
+        input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
+    :param kwargs: additional kwargs for ``open`` (such as
+        ``encoding`` or ``newline``).
+
+    :return: ``(sentences, tokens, entities)``
+    """
+    tag_conversion_map = tag_conversion_map or {}
+
+    with open(os.path.expanduser(path), **kwargs) as f:
+        raw_data = f.read()
+
+    sents = []
+    sent_tokens = []
+    tags = []
+    for line in raw_data.split("\n"):
+        line = line.strip("\n")
+        if re.fullmatch(r"\s*", line):
+            if len(sent_tokens) == 0:
+                continue
+            sents.append(sent_tokens)
+            sent_tokens = []
+            continue
+        token, tag = line.split(separator)
+        sent_tokens.append(token)
+        tags.append(tag_conversion_map.get(tag, tag))
+
+    tokens = list(flatten(sents))
+    entities = ner_entities(tokens, tags)
+
+    return sents, list(flatten(sents)), entities
+
+
+def hgdataset_from_conll2002(
+    path: str,
+    tag_conversion_map: Optional[Dict[str, str]] = None,
+    separator: str = "\t",
+    **kwargs,
+) -> HGDataset:
+    """Load a CoNLL-2002 file as a Huggingface Dataset.
+
+    :param path: passed to :func:`.load_conll2002_bio`
+    :param tag_conversion_map: passed to :func:`load_conll2002_bio`
+    :param separator: passed to :func:`load_conll2002_bio`
+    :param kwargs: passed to :func:`load_conll2002_bio`
+
+    :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
+    """
+    sentences, tokens, entities = load_conll2002_bio(
+        path, tag_conversion_map, separator, **kwargs
+    )
+
+    # convert entities to labels
+    tags = ["O"] * len(tokens)
+    for entity in entities:
+        entity_len = entity.end_idx - entity.start_idx
+        tags[entity.start_idx : entity.end_idx] = [f"B-{entity.tag}"] + [
+            f"I-{entity.tag}"
+        ] * (entity_len - 1)
+
+    # cut into sentences
+    sent_ends = list(it.accumulate([len(s) for s in sentences]))
+    sent_starts = [0] + sent_ends[:-1]
+    sent_tags = [
+        tags[sent_start:sent_end]
+        for sent_start, sent_end in zip(sent_starts, sent_ends)
+    ]
+
+    dataset = HGDataset.from_dict({"tokens": sentences, "labels": sent_tags})
+    dataset = dataset.cast_column(
+        "labels", Sequence(ClassLabel(names=sorted(set(tags))))
+    )
+    return dataset
+
+
+def _tokenize_and_align_labels(
+    examples, tokenizer: PreTrainedTokenizerFast, label_all_tokens: bool = True
+):
+    """Adapted from https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb#scrollTo=vc0BSBLIIrJQ
+
+    :param examples: an object with keys 'tokens' and 'labels'
+    """
+    tokenized_inputs = tokenizer(
+        examples["tokens"], truncation=True, is_split_into_words=True
+    )
+
+    labels = []
+    for i, label in enumerate(examples[f"labels"]):
+        word_ids = tokenized_inputs.word_ids(batch_index=i)
+        previous_word_idx = None
+        label_ids = []
+        for word_idx in word_ids:
+            # Special tokens have a word id that is None. We set the
+            # label to -100 so they are automatically ignored in the
+            # loss function.
+            if word_idx is None:
+                label_ids.append(-100)
+            # We set the label for the first token of each word.
+            elif word_idx != previous_word_idx:
+                label_ids.append(label[word_idx])
+            # For the other tokens in a word, we set the label to
+            # either the current label or -100, depending on the
+            # label_all_tokens flag.
+            else:
+                label_ids.append(label[word_idx] if label_all_tokens else -100)
+            previous_word_idx = word_idx
+
+        labels.append(label_ids)
+
+    tokenized_inputs["labels"] = labels
+
+    return tokenized_inputs
+
+
+def train_ner_model(
+    hg_id: str,
+    dataset: HGDataset,
+    targs: TrainingArguments,
+) -> PreTrainedModel:
+    from transformers import DataCollatorForTokenClassification
+
+    # BERT tokenizer splits tokens into subtokens. The
+    # tokenize_and_align_labels function correctly aligns labels and
+    # subtokens.
+    tokenizer = AutoTokenizer.from_pretrained(hg_id)
+    dataset = dataset.map(
+        ft.partial(_tokenize_and_align_labels, tokenizer=tokenizer), batched=True
+    )
+    dataset = dataset.train_test_split(test_size=0.1)
+
+    label_lst = dataset["train"].features["labels"].feature.names
+    model = AutoModelForTokenClassification.from_pretrained(
+        hg_id,
+        num_labels=len(label_lst),
+        id2label={i: label for i, label in enumerate(label_lst)},
+        label2id={label: i for i, label in enumerate(label_lst)},
+    )
+
+    trainer = Trainer(
+        model,
+        targs,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
+        # data_collator=DataCollatorForTokenClassificationWithBatchEncoding(tokenizer),
+        data_collator=DataCollatorForTokenClassification(tokenizer),
+        tokenizer=tokenizer,
+    )
+    trainer.train()
+
+    return model
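Taken together, the helpers in this new module form a small fine-tuning workflow: `load_conll2002_bio` parses a BIO-annotated file, `hgdataset_from_conll2002` wraps it as a Hugging Face dataset with `ClassLabel` features, and `train_ner_model` fine-tunes a token-classification checkpoint on it. A minimal usage sketch, where the corpus path and the `bert-base-cased` checkpoint are placeholder examples:

```python
from transformers import TrainingArguments

from renard.ner_utils import hgdataset_from_conll2002, train_ner_model

# Hypothetical input file: one "token<TAB>BIO-tag" pair per line,
# with sentences separated by blank lines, as load_conll2002_bio expects.
dataset = hgdataset_from_conll2002("./my_corpus.conll", separator="\t")

# Standard Hugging Face training arguments; the values are illustrative.
targs = TrainingArguments(
    output_dir="./ner-model",
    num_train_epochs=2,
    per_device_train_batch_size=4,
)

# train_ner_model tokenizes the sentences, aligns the BIO labels with
# subword tokens and fine-tunes the checkpoint on a 90/10 train/test split.
model = train_ner_model("bert-base-cased", dataset, targs)
```

`ner_entities` can also be used on its own to turn parallel token and BIO-tag lists into `NEREntity` objects, which is exactly what `load_conll2002_bio` does internally after reading the file.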
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/character_unification.py

@@ -54,8 +54,8 @@ def _assign_coreference_mentions(
     :param corefs:
     """
 
-    char_mentions: Dict[Character,
-        character: character.mentions for character in characters
+    char_mentions: Dict[Character, Set[Mention]] = {
+        character: set(character.mentions) for character in characters
     }
 
     # we assign each chain to the character with highest name
@@ -80,12 +80,12 @@ def _assign_coreference_mentions(
 
         # assign the chain to the character with the most occurences
         for mention in chain:
-            # TODO: complexity
             if not mention in char_mentions[best_character]:
-                char_mentions[best_character].
+                char_mentions[best_character].add(mention)
 
     return [
-        Character(c.names, mentions,
+        Character(c.names, sorted(mentions, key=lambda m: m.start_idx), c.gender)
+        for c, mentions in char_mentions.items()
     ]
 
 
@@ -209,7 +209,6 @@ class GraphRulesCharacterUnifier(PipelineStep):
 
         # * link nodes based on several rules
         for name1, name2 in combinations(G.nodes(), 2):
-
             # is one name a known hypocorism of the other ? (also
             # checks if both names are the same)
             if self.hypocorism_gazetteer.are_related(name1, name2):
@@ -263,7 +262,6 @@ class GraphRulesCharacterUnifier(PipelineStep):
             pass
 
         for name1, name2 in combinations(G.nodes(), 2):
-
             # check if characters have the same last name but a
             # different first name.
             human_name1 = HumanName(name1, constants=hname_constants)
@@ -333,10 +331,11 @@ class GraphRulesCharacterUnifier(PipelineStep):
         self, name1: str, name2: str, hname_constants: Constants
     ) -> bool:
         """Check if two names are related after removing their titles"""
-
-
-        raw_name1 = HumanName(name1, constants=
-        raw_name2 = HumanName(name2, constants=
+        old_string_format = hname_constants.string_format
+        hname_constants.string_format = "{first} {middle} {last}"
+        raw_name1 = HumanName(name1, constants=hname_constants).full_name
+        raw_name2 = HumanName(name2, constants=hname_constants).full_name
+        hname_constants.string_format = old_string_format
 
         if raw_name1 == "" or raw_name2 == "":
             return False
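The last hunk above replaces two truncated `HumanName` calls with a `string_format` round-trip on the shared `nameparser` constants: the format is temporarily set to `"{first} {middle} {last}"` so that rendering a parsed name drops its title, then restored. A standalone sketch of that trick using only `nameparser` (the helper name and example names are illustrative):

```python
from nameparser import HumanName
from nameparser.config import Constants

# A private Constants object, analogous to hname_constants in the hunk above.
constants = Constants()


def strip_titles(name: str) -> str:
    """Render a name without its title by temporarily changing the
    string format used when the parsed name is turned back into text."""
    old_string_format = constants.string_format
    constants.string_format = "{first} {middle} {last}"
    try:
        stripped = HumanName(name, constants=constants).full_name.strip()
    finally:
        constants.string_format = old_string_format
    return stripped


print(strip_titles("Dr. John Watson"))  # expected to drop the title: "John Watson"
```

Restoring the previous `string_format` matters because the `Constants` object is shared by every `HumanName` parsed with it, which is presumably why the new Renard code saves and restores `old_string_format` as well.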
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/characters_extraction.py

@@ -1,7 +1,7 @@
 import renard.pipeline.character_unification as cu
 
 print(
-    "[warning] the characters_extraction module is deprecated. Use
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
 )
 
 Character = cu.Character