renard-pipeline 0.3.1__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/PKG-INFO +42 -4
  2. renard_pipeline-0.4.1/README.md +65 -0
  3. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/pyproject.toml +6 -2
  4. renard_pipeline-0.4.1/renard/ner_utils.py +342 -0
  5. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/character_unification.py +10 -11
  6. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/characters_extraction.py +1 -1
  7. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/core.py +51 -34
  8. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/graph_extraction.py +7 -10
  9. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/ner.py +79 -58
  10. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/stanford_corenlp.py +1 -1
  11. renard_pipeline-0.4.1/renard/py.typed +0 -0
  12. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/utils.py +1 -52
  13. renard_pipeline-0.3.1/README.md +0 -31
  14. renard_pipeline-0.3.1/renard/ner_utils.py +0 -80
  15. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/LICENSE +0 -0
  16. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/gender.py +0 -0
  17. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/graph_utils.py +0 -0
  18. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/nltk_utils.py +0 -0
  19. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/__init__.py +0 -0
  20. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/corefs/__init__.py +0 -0
  21. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/corefs/corefs.py +0 -0
  22. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/preconfigured.py +0 -0
  23. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/preprocessing.py +0 -0
  24. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/progress.py +0 -0
  25. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/quote_detection.py +0 -0
  26. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/sentiment_analysis.py +0 -0
  27. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/speaker_attribution.py +0 -0
  28. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/tokenization.py +0 -0
  29. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/plot_utils.py +0 -0
  30. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/__init__.py +0 -0
  31. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/datas/License.txt +0 -0
  32. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
  33. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/hypocorisms.py +0 -0
  34. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/pronouns/__init__.py +0 -0
  35. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/pronouns/pronouns.py +0 -0
  36. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/titles/__init__.py +0 -0
  37. {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/titles/titles.py +0 -0

{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/PKG-INFO

@@ -1,7 +1,8 @@
  Metadata-Version: 2.1
  Name: renard-pipeline
- Version: 0.3.1
+ Version: 0.4.1
  Summary: Relationships Extraction from NARrative Documents
+ Home-page: https://github.com/CompNet/Renard
  License: GPL-3.0-only
  Author: Arthur Amalvy
  Author-email: arthur.amalvy@univ-avignon.fr
@@ -14,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
  Provides-Extra: spacy
  Provides-Extra: stanza
  Requires-Dist: coreferee (>=1.4.0,<2.0.0) ; extra == "spacy"
+ Requires-Dist: datasets (>=2.16.1,<3.0.0)
  Requires-Dist: grimbert (>=0.1.0,<0.2.0)
  Requires-Dist: matplotlib (>=3.5.3,<4.0.0)
  Requires-Dist: more-itertools (>=10.1.0,<11.0.0)
@@ -26,15 +28,19 @@ Requires-Dist: seqeval (==1.2.2)
  Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
  Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
  Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
- Requires-Dist: tibert (>=0.2.4,<0.3.0)
+ Requires-Dist: tibert (>=0.3.0,<0.4.0)
  Requires-Dist: torch (>=2.0.0,!=2.0.1)
  Requires-Dist: tqdm (>=4.62.3,<5.0.0)
  Requires-Dist: transformers (>=4.36.0,<5.0.0)
+ Project-URL: Documentation, https://compnet.github.io/Renard/
+ Project-URL: Repository, https://github.com/CompNet/Renard
  Description-Content-Type: text/markdown
  
  # Renard
  
- Relationships Extraction from NARrative Documents
+ Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+ ![Character network extracted from "Pride and Prejudice"](./docs/pp_white_bg.svg)
  
  
  # Installation
@@ -43,6 +49,8 @@ You can install the latest version using pip:
  
  > pip install renard-pipeline
  
+ Currently, Renard supports Python 3.8, 3.9 and 3.10.
+
  
  # Documentation
  
@@ -53,7 +61,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
  
  # Tutorial
  
- `renard_tutorial.py` is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+ Renard's central concept is the `Pipeline`.A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+ ```python
+ from renard.pipeline import Pipeline
+ from renard.pipeline.tokenization import NLTKTokenizer
+ from renard.pipeline.ner import NLTKNamedEntityRecognizer
+ from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+ from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+ with open("./my_doc.txt") as f:
+     text = f.read()
+
+ pipeline = Pipeline(
+     [
+         NLTKTokenizer(),
+         NLTKNamedEntityRecognizer(),
+         GraphRulesCharacterUnifier(min_appearance=10),
+         CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+     ]
+ )
+
+ out = pipeline(text)
+ ```
+
+ For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+
  
  
  # Running tests
@@ -64,3 +97,8 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
  
  Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
  
+
+ # Contributing
+
+ see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
+

renard_pipeline-0.4.1/README.md

@@ -0,0 +1,65 @@
+ # Renard
+
+ Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+ ![Character network extracted from "Pride and Prejudice"](./docs/pp_white_bg.svg)
+
+
+ # Installation
+
+ You can install the latest version using pip:
+
+ > pip install renard-pipeline
+
+ Currently, Renard supports Python 3.8, 3.9 and 3.10.
+
+
+ # Documentation
+
+ Documentation, including installation instructions, can be found at https://compnet.github.io/Renard/
+
+ If you need local documentation, it can be generated using `Sphinx`. From the `docs` directory, `make html` should create documentation under `docs/_build/html`.
+
+
+ # Tutorial
+
+ Renard's central concept is the `Pipeline`.A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+ ```python
+ from renard.pipeline import Pipeline
+ from renard.pipeline.tokenization import NLTKTokenizer
+ from renard.pipeline.ner import NLTKNamedEntityRecognizer
+ from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+ from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+ with open("./my_doc.txt") as f:
+     text = f.read()
+
+ pipeline = Pipeline(
+     [
+         NLTKTokenizer(),
+         NLTKNamedEntityRecognizer(),
+         GraphRulesCharacterUnifier(min_appearance=10),
+         CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+     ]
+ )
+
+ out = pipeline(text)
+ ```
+
+ For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+
+
+
+ # Running tests
+
+ `Renard` uses `pytest` for testing. To launch tests, use the following command :
+
+ > poetry run python -m pytest tests
+
+ Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
+
+
+ # Contributing
+
+ see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).

{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/pyproject.toml

@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "renard-pipeline"
- version = "0.3.1"
+ version = "0.4.1"
  description = "Relationships Extraction from NARrative Documents"
  authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
  license = "GPL-3.0-only"
@@ -8,6 +8,9 @@ readme = "README.md"
  packages = [
      { include = "renard" }
  ]
+ homepage = "https://github.com/CompNet/Renard"
+ repository = "https://github.com/CompNet/Renard"
+ documentation = "https://compnet.github.io/Renard/"
  
  [tool.poetry.dependencies]
  # optional dependencies
@@ -28,8 +31,9 @@ matplotlib = "^3.5.3"
  seqeval = "1.2.2"
  pandas = "^2.0.0"
  pytest = "^7.2.1"
- tibert = "^0.2.4"
+ tibert = "^0.3.0"
  grimbert = "^0.1.0"
+ datasets = "^2.16.1"
  
  [tool.poetry.dev-dependencies]
  hypothesis = "^6.24.0"

renard_pipeline-0.4.1/renard/ner_utils.py

@@ -0,0 +1,342 @@
+ from __future__ import annotations
+ from typing import TYPE_CHECKING, List, Optional, Union, Dict, Tuple
+ import os, re
+ import itertools as it
+ import functools as ft
+ from more_itertools import flatten
+ import torch
+ from torch.utils.data import Dataset
+ from datasets import Dataset as HGDataset
+ from datasets import Sequence, ClassLabel
+ from transformers import (
+     AutoModelForTokenClassification,
+     AutoTokenizer,
+     PreTrainedTokenizerFast,
+     PreTrainedModel,
+     Trainer,
+     TrainingArguments,
+ )
+ from transformers.tokenization_utils_base import BatchEncoding
+
+ if TYPE_CHECKING:
+     from renard.pipeline.ner import NEREntity
+
+
+ class DataCollatorForTokenClassificationWithBatchEncoding:
+     """Same as ``transformers.DataCollatorForTokenClassification``,
+     except it correctly returns a ``BatchEncoding`` object with
+     correct ``encodings`` attribute.
+
+     Don't know why this is not the default ?
+     """
+
+     def __init__(
+         self,
+         tokenizer: PreTrainedTokenizerFast,
+         pad_to_multiple_of: Optional[int] = None,
+     ) -> None:
+         self.tokenizer = tokenizer
+         self.pad_to_multiple_of = pad_to_multiple_of
+         self.pad_token_id = {"label": -100, "labels": -100}
+
+     def __call__(self, features: List[dict]) -> Union[dict, BatchEncoding]:
+         keys = features[0].keys()
+         sequence_len = max([len(f["input_ids"]) for f in features])
+
+         # We do the padding and collating manually instead of calling
+         # self.tokenizer.pad, because pad does not work on arbitrary
+         # features.
+         batch = BatchEncoding({})
+         for key in keys:
+             if self.tokenizer.padding_side == "right":
+                 batch[key] = [
+                     f[key]
+                     + [self.pad_token_id.get(key, 0)] * (sequence_len - len(f[key]))
+                     for f in features
+                 ]
+             else:
+                 batch[key] = [
+                     [
+                         self.pad_token_id.get(key, 0) * (sequence_len - len(f[key]))
+                         + f[key]
+                         for f in features
+                     ]
+                 ]
+
+         batch._encodings = [f.encodings[0] for f in features]
+
+         for k, v in batch.items():
+             batch[k] = torch.tensor(v)
+
+         return batch
+
+
+ class NERDataset(Dataset):
+     """
+     :ivar _context_mask: for each element, a mask indicating which
+         tokens are part of the context (1 for context, 0 for text on
+         which to perform inference). The mask allows to discard
+         predictions made for context at inference time, even though
+         the context can still be passed as input to the model.
+     """
+
+     def __init__(
+         self,
+         elements: List[List[str]],
+         tokenizer: PreTrainedTokenizerFast,
+         context_mask: Optional[List[List[int]]] = None,
+     ) -> None:
+         self.elements = elements
+
+         if context_mask:
+             assert all(
+                 [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
+             )
+         self._context_mask = context_mask or [[0] * len(elt) for elt in self.elements]
+
+         self.tokenizer = tokenizer
+
+     def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
+         element = self.elements[index]
+
+         batch = self.tokenizer(
+             element,
+             truncation=True,
+             max_length=512, # TODO
+             is_split_into_words=True,
+         )
+
+         batch["context_mask"] = [0] * len(batch["input_ids"])
+         elt_context_mask = self._context_mask[index]
+         for i in range(len(element)):
+             w2t = batch.word_to_tokens(0, i)
+             mask_value = elt_context_mask[i]
+             tokens_mask = [mask_value] * (w2t.end - w2t.start)
+             batch["context_mask"][w2t.start : w2t.end] = tokens_mask
+
+         return batch
+
+     def __len__(self) -> int:
+         return len(self.elements)
+
+
+ def ner_entities(
+     tokens: List[str], bio_tags: List[str], resolve_inconsistencies: bool = True
+ ) -> List[NEREntity]:
+     """Extract NER entities from a list of BIO tags
+
+     :param tokens: a list of tokens
+     :param bio_tags: a list of BIO tags. In particular, BIO tags
+         should be in the CoNLL-2002 form (such as 'B-PER I-PER')
+
+     :return: A list of ner entities, in apparition order
+     """
+     from renard.pipeline.ner import NEREntity
+
+     assert len(tokens) == len(bio_tags)
+
+     entities = []
+     current_tag: Optional[str] = None
+     current_tag_start_idx: Optional[int] = None
+
+     for i, tag in enumerate(bio_tags):
+         if not current_tag is None and not tag.startswith("I-"):
+             assert not current_tag_start_idx is None
+             entities.append(
+                 NEREntity(
+                     tokens[current_tag_start_idx:i],
+                     current_tag_start_idx,
+                     i,
+                     current_tag,
+                 )
+             )
+             current_tag = None
+             current_tag_start_idx = None
+
+         if tag.startswith("B-"):
+             current_tag = tag[2:]
+             current_tag_start_idx = i
+
+         elif tag.startswith("I-"):
+             if current_tag is None and resolve_inconsistencies:
+                 current_tag = tag[2:]
+                 current_tag_start_idx = i
+                 continue
+
+     if not current_tag is None:
+         assert not current_tag_start_idx is None
+         entities.append(
+             NEREntity(
+                 tokens[current_tag_start_idx : len(tokens)],
+                 current_tag_start_idx,
+                 len(bio_tags),
+                 current_tag,
+             )
+         )
+
+     return entities
+
+
+ def load_conll2002_bio(
+     path: str,
+     tag_conversion_map: Optional[Dict[str, str]] = None,
+     separator: str = "\t",
+     **kwargs,
+ ) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
+     """Load a file under CoNLL2022 BIO format. Sentences are expected
+     to be separated by end of lines. Tags should be in the CoNLL-2002
+     format (such as 'B-PER I-PER') - If this is not the case, see the
+     ``tag_conversion_map`` argument.
+
+     :param path: path to the CoNLL-2002 formatted file
+     :param separator: separator between token and BIO tags
+     :param tag_conversion_map: conversion map for tags found in the
+         input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
+     :param kwargs: additional kwargs for ``open`` (such as
+         ``encoding`` or ``newline``).
+
+     :return: ``(sentences, tokens, entities)``
+     """
+     tag_conversion_map = tag_conversion_map or {}
+
+     with open(os.path.expanduser(path), **kwargs) as f:
+         raw_data = f.read()
+
+     sents = []
+     sent_tokens = []
+     tags = []
+     for line in raw_data.split("\n"):
+         line = line.strip("\n")
+         if re.fullmatch(r"\s*", line):
+             if len(sent_tokens) == 0:
+                 continue
+             sents.append(sent_tokens)
+             sent_tokens = []
+             continue
+         token, tag = line.split(separator)
+         sent_tokens.append(token)
+         tags.append(tag_conversion_map.get(tag, tag))
+
+     tokens = list(flatten(sents))
+     entities = ner_entities(tokens, tags)
+
+     return sents, list(flatten(sents)), entities
+
+
+ def hgdataset_from_conll2002(
+     path: str,
+     tag_conversion_map: Optional[Dict[str, str]] = None,
+     separator: str = "\t",
+     **kwargs,
+ ) -> HGDataset:
+     """Load a CoNLL-2002 file as a Huggingface Dataset.
+
+     :param path: passed to :func:`.load_conll2002_bio`
+     :param tag_conversion_map: passed to :func:`load_conll2002_bio`
+     :param separator: passed to :func:`load_conll2002_bio`
+     :param kwargs: passed to :func:`load_conll2002_bio`
+
+     :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
+     """
+     sentences, tokens, entities = load_conll2002_bio(
+         path, tag_conversion_map, separator, **kwargs
+     )
+
+     # convert entities to labels
+     tags = ["O"] * len(tokens)
+     for entity in entities:
+         entity_len = entity.end_idx - entity.start_idx
+         tags[entity.start_idx : entity.end_idx] = [f"B-{entity.tag}"] + [
+             f"I-{entity.tag}"
+         ] * (entity_len - 1)
+
+     # cut into sentences
+     sent_ends = list(it.accumulate([len(s) for s in sentences]))
+     sent_starts = [0] + sent_ends[:-1]
+     sent_tags = [
+         tags[sent_start:sent_end]
+         for sent_start, sent_end in zip(sent_starts, sent_ends)
+     ]
+
+     dataset = HGDataset.from_dict({"tokens": sentences, "labels": sent_tags})
+     dataset = dataset.cast_column(
+         "labels", Sequence(ClassLabel(names=sorted(set(tags))))
+     )
+     return dataset
+
+
+ def _tokenize_and_align_labels(
+     examples, tokenizer: PreTrainedTokenizerFast, label_all_tokens: bool = True
+ ):
+     """Adapted from https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb#scrollTo=vc0BSBLIIrJQ
+
+     :param examples: an object with keys 'tokens' and 'labels'
+     """
+     tokenized_inputs = tokenizer(
+         examples["tokens"], truncation=True, is_split_into_words=True
+     )
+
+     labels = []
+     for i, label in enumerate(examples[f"labels"]):
+         word_ids = tokenized_inputs.word_ids(batch_index=i)
+         previous_word_idx = None
+         label_ids = []
+         for word_idx in word_ids:
+             # Special tokens have a word id that is None. We set the
+             # label to -100 so they are automatically ignored in the
+             # loss function.
+             if word_idx is None:
+                 label_ids.append(-100)
+             # We set the label for the first token of each word.
+             elif word_idx != previous_word_idx:
+                 label_ids.append(label[word_idx])
+             # For the other tokens in a word, we set the label to
+             # either the current label or -100, depending on the
+             # label_all_tokens flag.
+             else:
+                 label_ids.append(label[word_idx] if label_all_tokens else -100)
+             previous_word_idx = word_idx
+
+         labels.append(label_ids)
+
+     tokenized_inputs["labels"] = labels
+
+     return tokenized_inputs
+
+
+ def train_ner_model(
+     hg_id: str,
+     dataset: HGDataset,
+     targs: TrainingArguments,
+ ) -> PreTrainedModel:
+     from transformers import DataCollatorForTokenClassification
+
+     # BERT tokenizer splits tokens into subtokens. The
+     # tokenize_and_align_labels function correctly aligns labels and
+     # subtokens.
+     tokenizer = AutoTokenizer.from_pretrained(hg_id)
+     dataset = dataset.map(
+         ft.partial(_tokenize_and_align_labels, tokenizer=tokenizer), batched=True
+     )
+     dataset = dataset.train_test_split(test_size=0.1)
+
+     label_lst = dataset["train"].features["labels"].feature.names
+     model = AutoModelForTokenClassification.from_pretrained(
+         hg_id,
+         num_labels=len(label_lst),
+         id2label={i: label for i, label in enumerate(label_lst)},
+         label2id={label: i for i, label in enumerate(label_lst)},
+     )
+
+     trainer = Trainer(
+         model,
+         targs,
+         train_dataset=dataset["train"],
+         eval_dataset=dataset["test"],
+         # data_collator=DataCollatorForTokenClassificationWithBatchEncoding(tokenizer),
+         data_collator=DataCollatorForTokenClassification(tokenizer),
+         tokenizer=tokenizer,
+     )
+     trainer.train()
+
+     return model
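
The new `renard/ner_utils.py` module above adds helpers for loading CoNLL-2002 BIO corpora and fine-tuning a Huggingface token-classification model. Below is a minimal, hypothetical usage sketch based only on the signatures shown in this diff; the corpus path, model id and training arguments are illustrative, not package defaults.

```python
# Hypothetical sketch: combining the new renard.ner_utils helpers shown above.
# "./my_ner_corpus.conll" and "bert-base-cased" are placeholders.
from transformers import TrainingArguments

from renard.ner_utils import (
    hgdataset_from_conll2002,
    load_conll2002_bio,
    train_ner_model,
)

# Load a CoNLL-2002 BIO file (one "token<TAB>tag" pair per line,
# blank lines between sentences) into sentences, tokens and entities.
sentences, tokens, entities = load_conll2002_bio("./my_ner_corpus.conll")

# Or load it directly as a Huggingface dataset with "tokens"/"labels" columns,
# then fine-tune a token-classification model on it.
dataset = hgdataset_from_conll2002("./my_ner_corpus.conll")
model = train_ner_model(
    "bert-base-cased",
    dataset,
    TrainingArguments(output_dir="./ner_model", num_train_epochs=2),
)
```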

{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/character_unification.py

@@ -54,8 +54,8 @@ def _assign_coreference_mentions(
      :param corefs:
      """
  
-     char_mentions: Dict[Character, List[Mention]] = {
-         character: character.mentions for character in characters
+     char_mentions: Dict[Character, Set[Mention]] = {
+         character: set(character.mentions) for character in characters
      }
  
      # we assign each chain to the character with highest name
@@ -80,12 +80,12 @@
  
          # assign the chain to the character with the most occurences
          for mention in chain:
-             # TODO: complexity
              if not mention in char_mentions[best_character]:
-                 char_mentions[best_character].append(mention)
+                 char_mentions[best_character].add(mention)
  
      return [
-         Character(c.names, mentions, c.gender) for c, mentions in char_mentions.items()
+         Character(c.names, sorted(mentions, key=lambda m: m.start_idx), c.gender)
+         for c, mentions in char_mentions.items()
      ]
  
  
@@ -209,7 +209,6 @@ class GraphRulesCharacterUnifier(PipelineStep):
  
          # * link nodes based on several rules
          for name1, name2 in combinations(G.nodes(), 2):
-
              # is one name a known hypocorism of the other ? (also
              # checks if both names are the same)
              if self.hypocorism_gazetteer.are_related(name1, name2):
@@ -263,7 +262,6 @@
                  pass
  
          for name1, name2 in combinations(G.nodes(), 2):
-
              # check if characters have the same last name but a
             # different first name.
              human_name1 = HumanName(name1, constants=hname_constants)
@@ -333,10 +331,11 @@
          self, name1: str, name2: str, hname_constants: Constants
      ) -> bool:
          """Check if two names are related after removing their titles"""
-         local_constants = copy.deepcopy(hname_constants)
-         local_constants.string_format = "{first} {middle} {last}"
-         raw_name1 = HumanName(name1, constants=local_constants).full_name
-         raw_name2 = HumanName(name2, constants=local_constants).full_name
+         old_string_format = hname_constants.string_format
+         hname_constants.string_format = "{first} {middle} {last}"
+         raw_name1 = HumanName(name1, constants=hname_constants).full_name
+         raw_name2 = HumanName(name2, constants=hname_constants).full_name
+         hname_constants.string_format = old_string_format
  
          if raw_name1 == "" or raw_name2 == "":
              return False
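
For context, the `_assign_coreference_mentions` change above replaces list membership tests with a set (constant-time lookups, no duplicate mentions) and then sorts mentions by `start_idx` so rebuilt characters keep a deterministic apparition order. Here is a standalone sketch of that pattern; the `Mention` class below is a stand-in, not Renard's own type.

```python
# Standalone sketch of the dedup-then-sort pattern adopted above.
# This Mention dataclass is illustrative only.
from dataclasses import dataclass


@dataclass(frozen=True)  # frozen -> hashable, so instances can live in a set
class Mention:
    tokens: tuple
    start_idx: int
    end_idx: int


mentions = {Mention(("Lizzy",), 3, 4), Mention(("Elizabeth",), 10, 11)}
# Adding an already-known mention is a no-op: the set keeps a single copy.
mentions.add(Mention(("Elizabeth",), 10, 11))

# Rebuild an ordered list for downstream steps that expect apparition order.
ordered = sorted(mentions, key=lambda m: m.start_idx)
assert [m.start_idx for m in ordered] == [3, 10]
```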

{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/characters_extraction.py

@@ -1,7 +1,7 @@
  import renard.pipeline.character_unification as cu
  
  print(
-     "[warning] the characters_extraction module is deprecated. Use character_unfication instead."
+     "[warning] the characters_extraction module is deprecated. Use character_unification instead."
  )
  
  Character = cu.Character