renard-pipeline 0.3.1__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of renard-pipeline might be problematic.
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/PKG-INFO +42 -4
- renard_pipeline-0.4.1/README.md +65 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/pyproject.toml +6 -2
- renard_pipeline-0.4.1/renard/ner_utils.py +342 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/character_unification.py +10 -11
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/characters_extraction.py +1 -1
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/core.py +51 -34
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/graph_extraction.py +7 -10
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/ner.py +79 -58
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/stanford_corenlp.py +1 -1
- renard_pipeline-0.4.1/renard/py.typed +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/utils.py +1 -52
- renard_pipeline-0.3.1/README.md +0 -31
- renard_pipeline-0.3.1/renard/ner_utils.py +0 -80
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/LICENSE +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/gender.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/graph_utils.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/nltk_utils.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/corefs/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/corefs/corefs.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/preconfigured.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/preprocessing.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/progress.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/quote_detection.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/sentiment_analysis.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/speaker_attribution.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/tokenization.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/plot_utils.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/datas/License.txt +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/hypocorisms/hypocorisms.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/pronouns/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/pronouns/pronouns.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/titles/__init__.py +0 -0
- {renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/resources/titles/titles.py +0 -0
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/PKG-INFO

@@ -1,7 +1,8 @@
 Metadata-Version: 2.1
 Name: renard-pipeline
-Version: 0.3.1
+Version: 0.4.1
 Summary: Relationships Extraction from NARrative Documents
+Home-page: https://github.com/CompNet/Renard
 License: GPL-3.0-only
 Author: Arthur Amalvy
 Author-email: arthur.amalvy@univ-avignon.fr
@@ -14,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
 Provides-Extra: spacy
 Provides-Extra: stanza
 Requires-Dist: coreferee (>=1.4.0,<2.0.0) ; extra == "spacy"
+Requires-Dist: datasets (>=2.16.1,<3.0.0)
 Requires-Dist: grimbert (>=0.1.0,<0.2.0)
 Requires-Dist: matplotlib (>=3.5.3,<4.0.0)
 Requires-Dist: more-itertools (>=10.1.0,<11.0.0)
@@ -26,15 +28,19 @@ Requires-Dist: seqeval (==1.2.2)
 Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
 Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
 Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
-Requires-Dist: tibert (>=0.
+Requires-Dist: tibert (>=0.3.0,<0.4.0)
 Requires-Dist: torch (>=2.0.0,!=2.0.1)
 Requires-Dist: tqdm (>=4.62.3,<5.0.0)
 Requires-Dist: transformers (>=4.36.0,<5.0.0)
+Project-URL: Documentation, https://compnet.github.io/Renard/
+Project-URL: Repository, https://github.com/CompNet/Renard
 Description-Content-Type: text/markdown
 
 # Renard
 
-Relationships Extraction from NARrative Documents
+Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+
 
 
 # Installation
@@ -43,6 +49,8 @@ You can install the latest version using pip:
 
 > pip install renard-pipeline
 
+Currently, Renard supports Python 3.8, 3.9 and 3.10.
+
 
 # Documentation
 
@@ -53,7 +61,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
 
 # Tutorial
 
-
+Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+```python
+from renard.pipeline import Pipeline
+from renard.pipeline.tokenization import NLTKTokenizer
+from renard.pipeline.ner import NLTKNamedEntityRecognizer
+from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+with open("./my_doc.txt") as f:
+    text = f.read()
+
+pipeline = Pipeline(
+    [
+        NLTKTokenizer(),
+        NLTKNamedEntityRecognizer(),
+        GraphRulesCharacterUnifier(min_appearance=10),
+        CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+    ]
+)
+
+out = pipeline(text)
+```
+
+For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+
 
 
 # Running tests
@@ -64,3 +97,8 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
 
 Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
 
+
+# Contributing
+
+see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
+
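The tutorial above stops at `out = pipeline(text)`. As a rough follow-up sketch (not shown in this diff, so the attribute name and plotting code are assumptions to check against the Renard documentation), the returned pipeline state is expected to expose the extracted character network as a `networkx` graph:

```python
# Sketch only: `out` is the state returned by `pipeline(text)` above.
# `character_network` is an assumed attribute name, inferred from the
# graph_extraction / character_unification module names in this release.
import matplotlib.pyplot as plt
import networkx as nx

graph = out.character_network  # assumed: a networkx graph of characters
nx.draw(graph, with_labels=True, labels={c: str(c) for c in graph.nodes()})
plt.show()
```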
renard_pipeline-0.4.1/README.md (new file)

@@ -0,0 +1,65 @@
+# Renard
+
+Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+
+
+
+# Installation
+
+You can install the latest version using pip:
+
+> pip install renard-pipeline
+
+Currently, Renard supports Python 3.8, 3.9 and 3.10.
+
+
+# Documentation
+
+Documentation, including installation instructions, can be found at https://compnet.github.io/Renard/
+
+If you need local documentation, it can be generated using `Sphinx`. From the `docs` directory, `make html` should create documentation under `docs/_build/html`.
+
+
+# Tutorial
+
+Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
+
+```python
+from renard.pipeline import Pipeline
+from renard.pipeline.tokenization import NLTKTokenizer
+from renard.pipeline.ner import NLTKNamedEntityRecognizer
+from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+
+with open("./my_doc.txt") as f:
+    text = f.read()
+
+pipeline = Pipeline(
+    [
+        NLTKTokenizer(),
+        NLTKNamedEntityRecognizer(),
+        GraphRulesCharacterUnifier(min_appearance=10),
+        CoOccurrencesGraphExtractor(co_occurrences_dist=25)
+    ]
+)
+
+out = pipeline(text)
+```
+
+For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
+
+
+
+# Running tests
+
+`Renard` uses `pytest` for testing. To launch tests, use the following command :
+
+> poetry run python -m pytest tests
+
+Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
+
+
+# Contributing
+
+see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "renard-pipeline"
-version = "0.3.1"
+version = "0.4.1"
 description = "Relationships Extraction from NARrative Documents"
 authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
 license = "GPL-3.0-only"
@@ -8,6 +8,9 @@ readme = "README.md"
 packages = [
     { include = "renard" }
 ]
+homepage = "https://github.com/CompNet/Renard"
+repository = "https://github.com/CompNet/Renard"
+documentation = "https://compnet.github.io/Renard/"
 
 [tool.poetry.dependencies]
 # optional dependencies
@@ -28,8 +31,9 @@ matplotlib = "^3.5.3"
 seqeval = "1.2.2"
 pandas = "^2.0.0"
 pytest = "^7.2.1"
-tibert = "^0.
+tibert = "^0.3.0"
 grimbert = "^0.1.0"
+datasets = "^2.16.1"
 
 [tool.poetry.dev-dependencies]
 hypothesis = "^6.24.0"
renard_pipeline-0.4.1/renard/ner_utils.py (new file)

@@ -0,0 +1,342 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, List, Optional, Union, Dict, Tuple
+import os, re
+import itertools as it
+import functools as ft
+from more_itertools import flatten
+import torch
+from torch.utils.data import Dataset
+from datasets import Dataset as HGDataset
+from datasets import Sequence, ClassLabel
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    PreTrainedTokenizerFast,
+    PreTrainedModel,
+    Trainer,
+    TrainingArguments,
+)
+from transformers.tokenization_utils_base import BatchEncoding
+
+if TYPE_CHECKING:
+    from renard.pipeline.ner import NEREntity
+
+
+class DataCollatorForTokenClassificationWithBatchEncoding:
+    """Same as ``transformers.DataCollatorForTokenClassification``,
+    except it correctly returns a ``BatchEncoding`` object with
+    correct ``encodings`` attribute.
+
+    Don't know why this is not the default ?
+    """
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerFast,
+        pad_to_multiple_of: Optional[int] = None,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.pad_to_multiple_of = pad_to_multiple_of
+        self.pad_token_id = {"label": -100, "labels": -100}
+
+    def __call__(self, features: List[dict]) -> Union[dict, BatchEncoding]:
+        keys = features[0].keys()
+        sequence_len = max([len(f["input_ids"]) for f in features])
+
+        # We do the padding and collating manually instead of calling
+        # self.tokenizer.pad, because pad does not work on arbitrary
+        # features.
+        batch = BatchEncoding({})
+        for key in keys:
+            if self.tokenizer.padding_side == "right":
+                batch[key] = [
+                    f[key]
+                    + [self.pad_token_id.get(key, 0)] * (sequence_len - len(f[key]))
+                    for f in features
+                ]
+            else:
+                batch[key] = [
+                    [self.pad_token_id.get(key, 0)]
+                    * (sequence_len - len(f[key]))
+                    + f[key]
+                    for f in features
+                ]
+
+        batch._encodings = [f.encodings[0] for f in features]
+
+        for k, v in batch.items():
+            batch[k] = torch.tensor(v)
+
+        return batch
+
+
+class NERDataset(Dataset):
+    """
+    :ivar _context_mask: for each element, a mask indicating which
+        tokens are part of the context (1 for context, 0 for text on
+        which to perform inference). The mask allows to discard
+        predictions made for context at inference time, even though
+        the context can still be passed as input to the model.
+    """
+
+    def __init__(
+        self,
+        elements: List[List[str]],
+        tokenizer: PreTrainedTokenizerFast,
+        context_mask: Optional[List[List[int]]] = None,
+    ) -> None:
+        self.elements = elements
+
+        if context_mask:
+            assert all(
+                [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
+            )
+        self._context_mask = context_mask or [[0] * len(elt) for elt in self.elements]
+
+        self.tokenizer = tokenizer
+
+    def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
+        element = self.elements[index]
+
+        batch = self.tokenizer(
+            element,
+            truncation=True,
+            max_length=512,  # TODO
+            is_split_into_words=True,
+        )
+
+        batch["context_mask"] = [0] * len(batch["input_ids"])
+        elt_context_mask = self._context_mask[index]
+        for i in range(len(element)):
+            w2t = batch.word_to_tokens(0, i)
+            mask_value = elt_context_mask[i]
+            tokens_mask = [mask_value] * (w2t.end - w2t.start)
+            batch["context_mask"][w2t.start : w2t.end] = tokens_mask
+
+        return batch
+
+    def __len__(self) -> int:
+        return len(self.elements)
+
+
+def ner_entities(
+    tokens: List[str], bio_tags: List[str], resolve_inconsistencies: bool = True
+) -> List[NEREntity]:
+    """Extract NER entities from a list of BIO tags
+
+    :param tokens: a list of tokens
+    :param bio_tags: a list of BIO tags. In particular, BIO tags
+        should be in the CoNLL-2002 form (such as 'B-PER I-PER')
+
+    :return: A list of ner entities, in apparition order
+    """
+    from renard.pipeline.ner import NEREntity
+
+    assert len(tokens) == len(bio_tags)
+
+    entities = []
+    current_tag: Optional[str] = None
+    current_tag_start_idx: Optional[int] = None
+
+    for i, tag in enumerate(bio_tags):
+        if not current_tag is None and not tag.startswith("I-"):
+            assert not current_tag_start_idx is None
+            entities.append(
+                NEREntity(
+                    tokens[current_tag_start_idx:i],
+                    current_tag_start_idx,
+                    i,
+                    current_tag,
+                )
+            )
+            current_tag = None
+            current_tag_start_idx = None
+
+        if tag.startswith("B-"):
+            current_tag = tag[2:]
+            current_tag_start_idx = i
+
+        elif tag.startswith("I-"):
+            if current_tag is None and resolve_inconsistencies:
+                current_tag = tag[2:]
+                current_tag_start_idx = i
+                continue
+
+    if not current_tag is None:
+        assert not current_tag_start_idx is None
+        entities.append(
+            NEREntity(
+                tokens[current_tag_start_idx : len(tokens)],
+                current_tag_start_idx,
+                len(bio_tags),
+                current_tag,
+            )
+        )
+
+    return entities
+
+
+def load_conll2002_bio(
+    path: str,
+    tag_conversion_map: Optional[Dict[str, str]] = None,
+    separator: str = "\t",
+    **kwargs,
+) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
+    """Load a file under CoNLL-2002 BIO format. Sentences are expected
+    to be separated by end of lines. Tags should be in the CoNLL-2002
+    format (such as 'B-PER I-PER') - If this is not the case, see the
+    ``tag_conversion_map`` argument.
+
+    :param path: path to the CoNLL-2002 formatted file
+    :param separator: separator between token and BIO tags
+    :param tag_conversion_map: conversion map for tags found in the
+        input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
+    :param kwargs: additional kwargs for ``open`` (such as
+        ``encoding`` or ``newline``).
+
+    :return: ``(sentences, tokens, entities)``
+    """
+    tag_conversion_map = tag_conversion_map or {}
+
+    with open(os.path.expanduser(path), **kwargs) as f:
+        raw_data = f.read()
+
+    sents = []
+    sent_tokens = []
+    tags = []
+    for line in raw_data.split("\n"):
+        line = line.strip("\n")
+        if re.fullmatch(r"\s*", line):
+            if len(sent_tokens) == 0:
+                continue
+            sents.append(sent_tokens)
+            sent_tokens = []
+            continue
+        token, tag = line.split(separator)
+        sent_tokens.append(token)
+        tags.append(tag_conversion_map.get(tag, tag))
+
+    tokens = list(flatten(sents))
+    entities = ner_entities(tokens, tags)
+
+    return sents, list(flatten(sents)), entities
+
+
+def hgdataset_from_conll2002(
+    path: str,
+    tag_conversion_map: Optional[Dict[str, str]] = None,
+    separator: str = "\t",
+    **kwargs,
+) -> HGDataset:
+    """Load a CoNLL-2002 file as a Huggingface Dataset.
+
+    :param path: passed to :func:`.load_conll2002_bio`
+    :param tag_conversion_map: passed to :func:`load_conll2002_bio`
+    :param separator: passed to :func:`load_conll2002_bio`
+    :param kwargs: passed to :func:`load_conll2002_bio`
+
+    :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
+    """
+    sentences, tokens, entities = load_conll2002_bio(
+        path, tag_conversion_map, separator, **kwargs
+    )
+
+    # convert entities to labels
+    tags = ["O"] * len(tokens)
+    for entity in entities:
+        entity_len = entity.end_idx - entity.start_idx
+        tags[entity.start_idx : entity.end_idx] = [f"B-{entity.tag}"] + [
+            f"I-{entity.tag}"
+        ] * (entity_len - 1)
+
+    # cut into sentences
+    sent_ends = list(it.accumulate([len(s) for s in sentences]))
+    sent_starts = [0] + sent_ends[:-1]
+    sent_tags = [
+        tags[sent_start:sent_end]
+        for sent_start, sent_end in zip(sent_starts, sent_ends)
+    ]
+
+    dataset = HGDataset.from_dict({"tokens": sentences, "labels": sent_tags})
+    dataset = dataset.cast_column(
+        "labels", Sequence(ClassLabel(names=sorted(set(tags))))
+    )
+    return dataset
+
+
+def _tokenize_and_align_labels(
+    examples, tokenizer: PreTrainedTokenizerFast, label_all_tokens: bool = True
+):
+    """Adapted from https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb#scrollTo=vc0BSBLIIrJQ
+
+    :param examples: an object with keys 'tokens' and 'labels'
+    """
+    tokenized_inputs = tokenizer(
+        examples["tokens"], truncation=True, is_split_into_words=True
+    )
+
+    labels = []
+    for i, label in enumerate(examples[f"labels"]):
+        word_ids = tokenized_inputs.word_ids(batch_index=i)
+        previous_word_idx = None
+        label_ids = []
+        for word_idx in word_ids:
+            # Special tokens have a word id that is None. We set the
+            # label to -100 so they are automatically ignored in the
+            # loss function.
+            if word_idx is None:
+                label_ids.append(-100)
+            # We set the label for the first token of each word.
+            elif word_idx != previous_word_idx:
+                label_ids.append(label[word_idx])
+            # For the other tokens in a word, we set the label to
+            # either the current label or -100, depending on the
+            # label_all_tokens flag.
+            else:
+                label_ids.append(label[word_idx] if label_all_tokens else -100)
+            previous_word_idx = word_idx
+
+        labels.append(label_ids)
+
+    tokenized_inputs["labels"] = labels
+
+    return tokenized_inputs
+
+
+def train_ner_model(
+    hg_id: str,
+    dataset: HGDataset,
+    targs: TrainingArguments,
+) -> PreTrainedModel:
+    from transformers import DataCollatorForTokenClassification
+
+    # BERT tokenizer splits tokens into subtokens. The
+    # tokenize_and_align_labels function correctly aligns labels and
+    # subtokens.
+    tokenizer = AutoTokenizer.from_pretrained(hg_id)
+    dataset = dataset.map(
+        ft.partial(_tokenize_and_align_labels, tokenizer=tokenizer), batched=True
+    )
+    dataset = dataset.train_test_split(test_size=0.1)
+
+    label_lst = dataset["train"].features["labels"].feature.names
+    model = AutoModelForTokenClassification.from_pretrained(
+        hg_id,
+        num_labels=len(label_lst),
+        id2label={i: label for i, label in enumerate(label_lst)},
+        label2id={label: i for i, label in enumerate(label_lst)},
+    )
+
+    trainer = Trainer(
+        model,
+        targs,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
+        # data_collator=DataCollatorForTokenClassificationWithBatchEncoding(tokenizer),
+        data_collator=DataCollatorForTokenClassification(tokenizer),
+        tokenizer=tokenizer,
+    )
+    trainer.train()
+
+    return model
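Taken together, the helpers in this new module form a small fine-tuning workflow: `load_conll2002_bio` parses a BIO-annotated file, `hgdataset_from_conll2002` wraps it as a Hugging Face dataset with `ClassLabel` features, and `train_ner_model` fine-tunes a token-classification checkpoint on it. A minimal usage sketch, where the corpus path and the `bert-base-cased` checkpoint are placeholder examples:

```python
from transformers import TrainingArguments

from renard.ner_utils import hgdataset_from_conll2002, train_ner_model

# Hypothetical input file: one "token<TAB>BIO-tag" pair per line,
# with sentences separated by blank lines, as load_conll2002_bio expects.
dataset = hgdataset_from_conll2002("./my_corpus.conll", separator="\t")

# Standard Hugging Face training arguments; the values are illustrative.
targs = TrainingArguments(
    output_dir="./ner-model",
    num_train_epochs=2,
    per_device_train_batch_size=4,
)

# train_ner_model tokenizes the sentences, aligns the BIO labels with
# subword tokens and fine-tunes the checkpoint on a 90/10 train/test split.
model = train_ner_model("bert-base-cased", dataset, targs)
```

`ner_entities` can also be used on its own to turn parallel token and BIO-tag lists into `NEREntity` objects, which is exactly what `load_conll2002_bio` does internally after reading the file.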
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/character_unification.py

@@ -54,8 +54,8 @@ def _assign_coreference_mentions(
     :param corefs:
     """
 
-    char_mentions: Dict[Character,
-        character: character.mentions for character in characters
+    char_mentions: Dict[Character, Set[Mention]] = {
+        character: set(character.mentions) for character in characters
     }
 
     # we assign each chain to the character with highest name
@@ -80,12 +80,12 @@ def _assign_coreference_mentions(
 
         # assign the chain to the character with the most occurences
         for mention in chain:
-            # TODO: complexity
             if not mention in char_mentions[best_character]:
-                char_mentions[best_character].
+                char_mentions[best_character].add(mention)
 
     return [
-        Character(c.names, mentions,
+        Character(c.names, sorted(mentions, key=lambda m: m.start_idx), c.gender)
+        for c, mentions in char_mentions.items()
     ]
 
 
@@ -209,7 +209,6 @@ class GraphRulesCharacterUnifier(PipelineStep):
 
         # * link nodes based on several rules
         for name1, name2 in combinations(G.nodes(), 2):
-
             # is one name a known hypocorism of the other ? (also
             # checks if both names are the same)
             if self.hypocorism_gazetteer.are_related(name1, name2):
@@ -263,7 +262,6 @@ class GraphRulesCharacterUnifier(PipelineStep):
             pass
 
         for name1, name2 in combinations(G.nodes(), 2):
-
             # check if characters have the same last name but a
             # different first name.
             human_name1 = HumanName(name1, constants=hname_constants)
@@ -333,10 +331,11 @@ class GraphRulesCharacterUnifier(PipelineStep):
         self, name1: str, name2: str, hname_constants: Constants
     ) -> bool:
         """Check if two names are related after removing their titles"""
-
-
-        raw_name1 = HumanName(name1, constants=
-        raw_name2 = HumanName(name2, constants=
+        old_string_format = hname_constants.string_format
+        hname_constants.string_format = "{first} {middle} {last}"
+        raw_name1 = HumanName(name1, constants=hname_constants).full_name
+        raw_name2 = HumanName(name2, constants=hname_constants).full_name
+        hname_constants.string_format = old_string_format
 
         if raw_name1 == "" or raw_name2 == "":
             return False
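The last hunk above replaces two truncated `HumanName` calls with a `string_format` round-trip on the shared `nameparser` constants: the format is temporarily set to `"{first} {middle} {last}"` so that rendering a parsed name drops its title, then restored. A standalone sketch of that trick using only `nameparser` (the helper name and example names are illustrative):

```python
from nameparser import HumanName
from nameparser.config import Constants

# A private Constants object, analogous to hname_constants in the hunk above.
constants = Constants()


def strip_titles(name: str) -> str:
    """Render a name without its title by temporarily changing the
    string format used when the parsed name is turned back into text."""
    old_string_format = constants.string_format
    constants.string_format = "{first} {middle} {last}"
    try:
        stripped = HumanName(name, constants=constants).full_name.strip()
    finally:
        constants.string_format = old_string_format
    return stripped


print(strip_titles("Dr. John Watson"))  # expected to drop the title: "John Watson"
```

Restoring the previous `string_format` matters because the `Constants` object is shared by every `HumanName` parsed with it, which is presumably why the new Renard code saves and restores `old_string_format` as well.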
{renard_pipeline-0.3.1 → renard_pipeline-0.4.1}/renard/pipeline/characters_extraction.py

@@ -1,7 +1,7 @@
 import renard.pipeline.character_unification as cu
 
 print(
-    "[warning] the characters_extraction module is deprecated. Use
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
 )
 
 Character = cu.Character