renard-pipeline 0.4.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of renard-pipeline might be problematic.
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/PKG-INFO +45 -20
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/README.md +26 -2
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/pyproject.toml +19 -19
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/graph_utils.py +11 -4
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/ner_utils.py +24 -14
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/character_unification.py +72 -19
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/characters_extraction.py +3 -1
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/core.py +141 -26
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/corefs/corefs.py +32 -33
- renard_pipeline-0.6.0/renard/pipeline/graph_extraction.py +604 -0
- renard_pipeline-0.6.0/renard/pipeline/ner/__init__.py +1 -0
- {renard_pipeline-0.4.1/renard/pipeline → renard_pipeline-0.6.0/renard/pipeline/ner}/ner.py +47 -76
- renard_pipeline-0.6.0/renard/pipeline/ner/retrieval.py +375 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/progress.py +32 -1
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/speaker_attribution.py +2 -3
- renard_pipeline-0.6.0/renard/pipeline/tokenization.py +84 -0
- renard_pipeline-0.6.0/renard/plot_utils.py +87 -0
- renard_pipeline-0.6.0/renard/resources/determiners/__init__.py +1 -0
- renard_pipeline-0.6.0/renard/resources/determiners/determiners.py +41 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/hypocorisms.py +3 -2
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/utils.py +57 -1
- renard_pipeline-0.4.1/renard/pipeline/graph_extraction.py +0 -515
- renard_pipeline-0.4.1/renard/pipeline/tokenization.py +0 -55
- renard_pipeline-0.4.1/renard/plot_utils.py +0 -67
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/LICENSE +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/gender.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/nltk_utils.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/__init__.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/corefs/__init__.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/preconfigured.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/preprocessing.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/quote_detection.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/sentiment_analysis.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/stanford_corenlp.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/py.typed +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/__init__.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/datas/License.txt +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/pronouns/__init__.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/pronouns/pronouns.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/titles/__init__.py +0 -0
- {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/titles/titles.py +0 -0
--- renard_pipeline-0.4.1/PKG-INFO
+++ renard_pipeline-0.6.0/PKG-INFO
@@ -1,46 +1,49 @@
 Metadata-Version: 2.1
 Name: renard-pipeline
-Version: 0.4.1
+Version: 0.6.0
 Summary: Relationships Extraction from NARrative Documents
 Home-page: https://github.com/CompNet/Renard
 License: GPL-3.0-only
 Author: Arthur Amalvy
 Author-email: arthur.amalvy@univ-avignon.fr
-Requires-Python: >=3.8,<3.
+Requires-Python: >=3.8,<3.12
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
 Provides-Extra: spacy
 Provides-Extra: stanza
-Requires-Dist: coreferee (>=1.4
-Requires-Dist: datasets (>=
-Requires-Dist: grimbert (>=0.1
-Requires-Dist: matplotlib (>=3.5
-Requires-Dist: more-itertools (>=10.
-Requires-Dist: nameparser (>=1.1
-Requires-Dist: networkx (>=
-Requires-Dist: nltk (>=3.
-Requires-Dist: pandas (>=2.0
-Requires-Dist: pytest (>=
-Requires-Dist:
-Requires-Dist: spacy (>=3.5
-Requires-Dist: spacy-transformers (>=1.
-Requires-Dist: stanza (>=1.3
-Requires-Dist: tibert (>=0.
+Requires-Dist: coreferee (>=1.4,<2.0) ; extra == "spacy"
+Requires-Dist: datasets (>=3.0,<4.0)
+Requires-Dist: grimbert (>=0.1,<0.2)
+Requires-Dist: matplotlib (>=3.5,<4.0)
+Requires-Dist: more-itertools (>=10.5,<11.0)
+Requires-Dist: nameparser (>=1.1,<2.0)
+Requires-Dist: networkx (>=3.0,<4.0)
+Requires-Dist: nltk (>=3.9,<4.0)
+Requires-Dist: pandas (>=2.0,<3.0)
+Requires-Dist: pytest (>=8.3.0,<9.0.0)
+Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
+Requires-Dist: spacy (>=3.5,<4.0) ; extra == "spacy"
+Requires-Dist: spacy-transformers (>=1.3,<2.0) ; extra == "spacy"
+Requires-Dist: stanza (>=1.3,<2.0) ; extra == "stanza"
+Requires-Dist: tibert (>=0.5,<0.6)
 Requires-Dist: torch (>=2.0.0,!=2.0.1)
 Requires-Dist: tqdm (>=4.62.3,<5.0.0)
-Requires-Dist: transformers (>=4.36
+Requires-Dist: transformers (>=4.36,<5.0)
 Project-URL: Documentation, https://compnet.github.io/Renard/
 Project-URL: Repository, https://github.com/CompNet/Renard
 Description-Content-Type: text/markdown

 # Renard

-
+[](https://doi.org/10.21105/joss.06574)

-
+Renard (Relationship Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+[image]


 # Installation
@@ -102,3 +105,25 @@ Expensive tests are disabled by default. These can be run by setting the environ

 see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).

+
+# How to cite
+
+If you use Renard in your research project, please cite it as follows:
+
+```bibtex
+@Article{Amalvy2024,
+  doi = {10.21105/joss.06574},
+  year = {2024},
+  publisher = {The Open Journal},
+  volume = {9},
+  number = {98},
+  pages = {6574},
+  author = {Amalvy, A. and Labatut, V. and Dufour, R.},
+  title = {Renard: A Modular Pipeline for Extracting Character
+           Networks from Narrative Texts},
+  journal = {Journal of Open Source Software},
+}
+```
+
+We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
+
--- renard_pipeline-0.4.1/README.md
+++ renard_pipeline-0.6.0/README.md
@@ -1,8 +1,10 @@
 # Renard

-
+[](https://doi.org/10.21105/joss.06574)

-
+Renard (Relationship Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+[image]


 # Installation
@@ -63,3 +65,25 @@ Expensive tests are disabled by default. These can be run by setting the environ
 # Contributing

 see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
+
+
+# How to cite
+
+If you use Renard in your research project, please cite it as follows:
+
+```bibtex
+@Article{Amalvy2024,
+  doi = {10.21105/joss.06574},
+  year = {2024},
+  publisher = {The Open Journal},
+  volume = {9},
+  number = {98},
+  pages = {6574},
+  author = {Amalvy, A. and Labatut, V. and Dufour, R.},
+  title = {Renard: A Modular Pipeline for Extracting Character
+           Networks from Narrative Texts},
+  journal = {Journal of Open Source Software},
+}
+```
+
+We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
--- renard_pipeline-0.4.1/pyproject.toml
+++ renard_pipeline-0.6.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "renard-pipeline"
-version = "0.4.1"
+version = "0.6.0"
 description = "Relationships Extraction from NARrative Documents"
 authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
 license = "GPL-3.0-only"
@@ -14,29 +14,29 @@ documentation = "https://compnet.github.io/Renard/"

 [tool.poetry.dependencies]
 # optional dependencies
-stanza = { version = "^1.3
-spacy = { version = "^3.5
-coreferee = { version = "^1.4
-spacy-transformers = {version = "^1.
+stanza = { version = "^1.3", optional = true }
+spacy = { version = "^3.5", optional = true }
+coreferee = { version = "^1.4", optional = true }
+spacy-transformers = {version = "^1.3", optional = true}
 # required dependencies
-python = "^3.8,<3.
+python = "^3.8,<3.12"
 torch = ">=2.0.0, !=2.0.1"
-transformers = "^4.36
-nltk = "^3.
+transformers = "^4.36"
+nltk = "^3.9"
 tqdm = "^4.62.3"
-networkx = "^
-more-itertools = "^10.
-nameparser = "^1.1
-matplotlib = "^3.5
-
-
-
-
-
-
+networkx = "^3.0"
+more-itertools = "^10.5"
+nameparser = "^1.1"
+matplotlib = "^3.5"
+pandas = "^2.0"
+pytest = "^8.3.0"
+tibert = "^0.5"
+grimbert = "^0.1"
+datasets = "^3.0"
+rank-bm25 = "^0.2.2"

 [tool.poetry.dev-dependencies]
-hypothesis = "^6.
+hypothesis = "^6.82"
 Sphinx = "^4.3.1"
 sphinx-rtd-theme = "^1.0.0"
 sphinx-autodoc-typehints = "^1.12.0"
--- renard_pipeline-0.4.1/renard/graph_utils.py
+++ renard_pipeline-0.6.0/renard/graph_utils.py
@@ -70,10 +70,17 @@ def graph_with_names(
     else:
         name_style_fn = name_style

-
-
-
-
+    mapping = {}
+    for character in G.nodes():
+        # NOTE: it is *possible* to have a graph where nodes are not
+        # characters (for example, simple strings). Therefore, we are
+        # lenient here
+        try:
+            mapping[character] = name_style_fn(character)
+        except AttributeError:
+            mapping[character] = character
+
+    return nx.relabel_nodes(G, mapping)


 def layout_with_names(
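In 0.6.0, `graph_with_names` relabels nodes leniently: nodes that are not character objects (for example, plain strings) are kept as-is instead of raising. A minimal sketch of the same idea outside Renard (`most_frequent_name` is a hypothetical accessor, not Renard's actual API):

```python
import networkx as nx

G = nx.Graph()
G.add_edge("Alice", "Bob")  # plain-string nodes, no character attributes

def name_style_fn(character):
    # A character object would expose naming helpers; a plain string
    # does not, so this raises AttributeError for string nodes.
    return character.most_frequent_name()  # hypothetical accessor

mapping = {}
for node in G.nodes():
    try:
        mapping[node] = name_style_fn(node)
    except AttributeError:
        mapping[node] = node  # leave non-character nodes untouched

G = nx.relabel_nodes(G, mapping)
print(list(G.nodes()))  # ['Alice', 'Bob']
```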
--- renard_pipeline-0.4.1/renard/ner_utils.py
+++ renard_pipeline-0.6.0/renard/ner_utils.py
@@ -74,7 +74,7 @@ class DataCollatorForTokenClassificationWithBatchEncoding:
 class NERDataset(Dataset):
     """
     :ivar _context_mask: for each element, a mask indicating which
-        tokens are part of the context (
+        tokens are part of the context (0 for context, 1 for text on
         which to perform inference). The mask allows to discard
         predictions made for context at inference time, even though
         the context can still be passed as input to the model.
@@ -92,11 +92,11 @@ class NERDataset(Dataset):
         assert all(
             [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
         )
-        self._context_mask = context_mask or [[
+        self._context_mask = context_mask or [[1] * len(elt) for elt in self.elements]

         self.tokenizer = tokenizer

-    def __getitem__(self, index:
+    def __getitem__(self, index: int) -> BatchEncoding:
         element = self.elements[index]

         batch = self.tokenizer(
@@ -104,15 +104,18 @@ class NERDataset(Dataset):
             truncation=True,
             max_length=512,  # TODO
             is_split_into_words=True,
+            return_length=True,
         )

-        batch["
-
-
-
-
-        batch["context_mask"]
+        length = batch["length"][0]
+        del batch["length"]
+        if self.tokenizer.truncation_side == "right":
+            batch["context_mask"] = self._context_mask[index][:length]
+        else:
+            assert self.tokenizer.truncation_side == "left"
+            batch["context_mask"] = self._context_mask[index][
+                len(batch["input_ids"]) - length :
+            ]

         return batch
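`NERDataset.__getitem__` now asks the tokenizer for the post-truncation length (`return_length=True`) and cuts the context mask to match the tokenizer's `truncation_side`. A standalone sketch of just the slicing logic (a simplification: the real code indexes the mask per encoded token):

```python
from typing import List

def truncate_context_mask(
    context_mask: List[int], kept_length: int, truncation_side: str
) -> List[int]:
    """Keep the part of the mask matching the tokens the tokenizer kept:
    right truncation keeps the first kept_length positions, left
    truncation keeps the last ones."""
    if truncation_side == "right":
        return context_mask[:kept_length]
    assert truncation_side == "left"
    return context_mask[len(context_mask) - kept_length :]

# A 6-token mask when the model budget only keeps 4 tokens:
mask = [0, 0, 1, 1, 1, 1]  # 0 = context, 1 = text to predict on
print(truncate_context_mask(mask, 4, "right"))  # [0, 0, 1, 1]
print(truncate_context_mask(mask, 4, "left"))   # [1, 1, 1, 1]
```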
@@ -181,6 +184,7 @@ def load_conll2002_bio(
     path: str,
     tag_conversion_map: Optional[Dict[str, str]] = None,
     separator: str = "\t",
+    max_sent_len: Optional[int] = None,
     **kwargs,
 ) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
     """Load a file under CoNLL2022 BIO format. Sentences are expected
@@ -192,7 +196,9 @@ def load_conll2002_bio(
     :param separator: separator between token and BIO tags
     :param tag_conversion_map: conversion map for tags found in the
         input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
-    :param
+    :param max_sent_len: if specified, maximum length, in tokens, of
+        sentences.
+    :param kwargs: additional kwargs for :func:`open` (such as
         ``encoding`` or ``newline``).

     :return: ``(sentences, tokens, entities)``
@@ -207,7 +213,9 @@ def load_conll2002_bio(
     tags = []
     for line in raw_data.split("\n"):
         line = line.strip("\n")
-        if re.fullmatch(r"\s*", line)
+        if re.fullmatch(r"\s*", line) or (
+            not max_sent_len is None and len(sent_tokens) >= max_sent_len
+        ):
             if len(sent_tokens) == 0:
                 continue
             sents.append(sent_tokens)
@@ -227,6 +235,7 @@ def hgdataset_from_conll2002(
     path: str,
     tag_conversion_map: Optional[Dict[str, str]] = None,
     separator: str = "\t",
+    max_sent_len: Optional[int] = None,
     **kwargs,
 ) -> HGDataset:
     """Load a CoNLL-2002 file as a Huggingface Dataset.
@@ -234,12 +243,13 @@ def hgdataset_from_conll2002(
     :param path: passed to :func:`.load_conll2002_bio`
     :param tag_conversion_map: passed to :func:`load_conll2002_bio`
    :param separator: passed to :func:`load_conll2002_bio`
-    :param
+    :param max_sent_len: passed to :func:`load_conll2002_bio`
+    :param kwargs: additional kwargs for :func:`open`

     :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
     """
     sentences, tokens, entities = load_conll2002_bio(
-        path, tag_conversion_map, separator, **kwargs
+        path, tag_conversion_map, separator, max_sent_len, **kwargs
     )

     # convert entities to labels
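The new `max_sent_len` parameter forces a sentence break once a sentence reaches the given token count, which keeps examples within a model's input budget. A usage sketch with the signatures from this diff (the file path is made up):

```python
from renard.ner_utils import load_conll2002_bio, hgdataset_from_conll2002

# Hypothetical CoNLL-2002 BIO file: one "token<TAB>tag" pair per line,
# sentences separated by blank lines.
sents, tokens, entities = load_conll2002_bio(
    "novel.conll",  # hypothetical path
    tag_conversion_map={"B": "B-PER", "I": "I-PER"},
    separator="\t",
    max_sent_len=128,  # split any sentence longer than 128 tokens
    encoding="utf-8",
)

# The same option is forwarded when loading as a Huggingface dataset:
dataset = hgdataset_from_conll2002("novel.conll", max_sent_len=128)
```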
--- renard_pipeline-0.4.1/renard/pipeline/character_unification.py
+++ renard_pipeline-0.6.0/renard/pipeline/character_unification.py
@@ -1,5 +1,5 @@
 from typing import Any, Dict, List, FrozenSet, Set, Optional, Tuple, Union, Literal
-import
+import re, sys
 from itertools import combinations
 from collections import defaultdict, Counter
 from dataclasses import dataclass
@@ -11,6 +11,7 @@ from renard.pipeline.ner import NEREntity
 from renard.pipeline.progress import ProgressReporter
 from renard.resources.hypocorisms import HypocorismGazetteer
 from renard.resources.pronouns import is_a_female_pronoun, is_a_male_pronoun
+from renard.resources.determiners import singular_determiners
 from renard.resources.titles import is_a_male_title, is_a_female_title, all_titles


@@ -61,6 +62,8 @@ def _assign_coreference_mentions(
     # we assign each chain to the character with highest name
     # occurence in it
     for chain in corefs:
+        if len(char_mentions) == 0:
+            break
         # determine the characters with the highest number of
         # occurences
         occ_counter = {}
@@ -98,8 +101,13 @@ class NaiveCharacterUnifier(PipelineStep):
             character for it to be valid
         """
         self.min_appearances = min_appearances
+        # a default value, will be set by _pipeline_init_
+        self.character_ner_tag = "PER"
         super().__init__()

+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
+        self.character_ner_tag = character_ner_tag
+
     def __call__(
         self,
         text: str,
@@ -112,7 +120,7 @@ class NaiveCharacterUnifier(PipelineStep):
         :param tokens:
         :param entities:
         """
-        persons = [e for e in entities if e.tag ==
+        persons = [e for e in entities if e.tag == self.character_ner_tag]

         characters = defaultdict(list)
         for entity in persons:
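Both unifier steps now receive the tag marking character entities through `_pipeline_init_` instead of hard-coding `"PER"`, so pipelines whose NER step emits a different tagset (e.g. `"PERSON"`) still work. A self-contained sketch of the hook pattern (the step and entity classes here are illustrative, not Renard's):

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Entity:
    tokens: List[str]
    tag: str

class CharacterFilterStep:
    """Illustrative step following the same hook pattern as the diff."""

    def __init__(self) -> None:
        # default value, overwritten by _pipeline_init_ at pipeline setup
        self.character_ner_tag = "PER"

    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs) -> None:
        self.character_ner_tag = character_ner_tag

    def __call__(self, entities: List[Entity]) -> List[Entity]:
        # keep only character mentions, whatever the tagset calls them
        return [e for e in entities if e.tag == self.character_ner_tag]

step = CharacterFilterStep()
step._pipeline_init_("eng", character_ner_tag="PERSON")  # OntoNotes-style tag
print(step([Entity(["Ishmael"], "PERSON"), Entity(["Pequod"], "LOC")]))
```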
@@ -159,6 +167,8 @@ class GraphRulesCharacterUnifier(PipelineStep):
         min_appearances: int = 0,
         additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
         link_corefs_mentions: bool = False,
+        ignore_lone_titles: Optional[Set[str]] = None,
+        ignore_leading_determiner: bool = False,
     ) -> None:
         """
         :param min_appearances: minimum number of appearances of a
@@ -173,20 +183,32 @@ class GraphRulesCharacterUnifier(PipelineStep):
             extract a lot of spurious links. However, linking by
             coref is sometimes the only way to resolve a character
             alias.
+        :param ignore_lone_titles: a set of titles to ignore when they
+            stand on their own. This avoids extracting false
+            positive characters such as 'Mr.' or 'Miss'.
+        :param ignore_leading_determiner: if ``True``, will ignore the
+            leading determiner when applying unification rules. This
+            is useful if the NER model used in the pipeline adds
+            leading determiners as part of entities.
         """
         self.min_appearances = min_appearances
         self.additional_hypocorisms = additional_hypocorisms
         self.link_corefs_mentions = link_corefs_mentions
+        self.ignore_lone_titles = ignore_lone_titles or set()
+        self.character_ner_tag = "PER"  # a default value, will be set by _pipeline_init
+        self.ignore_leading_determiner = ignore_leading_determiner

         super().__init__()

-    def _pipeline_init_(self, lang: str,
+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
         self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
         if not self.additional_hypocorisms is None:
             for name, nicknames in self.additional_hypocorisms:
                 self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)

-
+        self.character_ner_tag = character_ner_tag
+
+        return super()._pipeline_init_(lang, **kwargs)

     def __call__(
         self,
@@ -196,12 +218,17 @@ class GraphRulesCharacterUnifier(PipelineStep):
     ) -> Dict[str, Any]:
         import networkx as nx

-        mentions = [m for m in entities if m.tag ==
-        mentions_str =
+        mentions = [m for m in entities if m.tag == self.character_ner_tag]
+        mentions_str = set(
+            filter(
+                lambda m: not m in self.ignore_lone_titles,
+                map(lambda m: " ".join(m.tokens), mentions),
+            )
+        )

         # * create a graph where each node is a mention detected by NER
         G = nx.Graph()
-        for mention_str in
+        for mention_str in mentions_str:
             G.add_node(mention_str)

         # * HumanName local configuration - dependant on language
@@ -209,23 +236,28 @@ class GraphRulesCharacterUnifier(PipelineStep):

         # * link nodes based on several rules
         for name1, name2 in combinations(G.nodes(), 2):
+
+            # preprocess name when needed
+            pname1 = self._preprocess_name(name1)
+            pname2 = self._preprocess_name(name2)
+
             # is one name a known hypocorism of the other ? (also
             # checks if both names are the same)
-            if self.hypocorism_gazetteer.are_related(
+            if self.hypocorism_gazetteer.are_related(pname1, pname2):
                 G.add_edge(name1, name2)
                 continue

             # if we remove the title, is one name related to the other
             # ?
             if self.names_are_related_after_title_removal(
-
+                pname1, pname2, hname_constants
             ):
                 G.add_edge(name1, name2)
                 continue

             # add an edge if two characters have the same family names
-            human_name1 = HumanName(
-            human_name2 = HumanName(
+            human_name1 = HumanName(pname1, constants=hname_constants)
+            human_name2 = HumanName(pname2, constants=hname_constants)
             if (
                 len(human_name1.last) > 0
                 and human_name1.last.lower() == human_name2.last.lower()
@@ -262,10 +294,15 @@ class GraphRulesCharacterUnifier(PipelineStep):
                 pass

         for name1, name2 in combinations(G.nodes(), 2):
+
+            # preprocess names when needed
+            pname1 = self._preprocess_name(name1)
+            pname2 = self._preprocess_name(name2)
+
             # check if characters have the same last name but a
             # different first name.
-            human_name1 = HumanName(
-            human_name2 = HumanName(
+            human_name1 = HumanName(pname1, constants=hname_constants)
+            human_name2 = HumanName(pname2, constants=hname_constants)
             if (
                 len(human_name1.last) > 0
                 and len(human_name2.last) > 0
@@ -317,6 +354,17 @@ class GraphRulesCharacterUnifier(PipelineStep):

         return {"characters": characters}

+    def _preprocess_name(self, name) -> str:
+        if self.ignore_leading_determiner:
+            if not self.lang in singular_determiners:
+                print(
+                    f"[warning] can't ignore leading determiners for {self.lang}",
+                    file=sys.stderr,
+                )
+            for determiner in singular_determiners.get(self.lang, []):
+                name = re.sub(f"^{determiner} ", " ", name, flags=re.I)
+        return name
+
     def _make_hname_constants(self) -> Constants:
         if self.lang == "eng":
             return Constants()
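`_preprocess_name` strips a leading singular determiner (per language) before the unification rules compare names, so mentions like "the Beast" and "Beast" can be linked. A self-contained sketch of the same regex approach (the determiner lists below are illustrative; Renard ships its own in `renard.resources.determiners`):

```python
import re

# Illustrative determiner lists, standing in for Renard's
# singular_determiners resource.
singular_determiners = {"eng": ["the", "a", "an"], "fra": ["le", "la"]}

def strip_leading_determiner(name: str, lang: str) -> str:
    for determiner in singular_determiners.get(lang, []):
        # only a determiner followed by a space counts as "leading"
        name = re.sub(f"^{re.escape(determiner)} ", "", name, flags=re.I)
    return name

print(strip_leading_determiner("The Beast", "eng"))  # 'Beast'
print(strip_leading_determiner("Theodore", "eng"))   # 'Theodore' (untouched)
```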
@@ -345,13 +393,18 @@ class GraphRulesCharacterUnifier(PipelineStep):
             or self.hypocorism_gazetteer.are_related(raw_name1, raw_name2)
         )

-    def names_are_in_coref(
+    def names_are_in_coref(
+        self, name1: str, name2: str, corefs: List[List[Mention]]
+    ) -> bool:
+        once_together = False
         for coref_chain in corefs:
-
-
-            ):
-                return
-
+            name1_in = any([name1 == " ".join(m.tokens) for m in coref_chain])
+            name2_in = any([name2 == " ".join(m.tokens) for m in coref_chain])
+            if name1_in == (not name2_in):
+                return False
+            elif name1_in and name2_in:
+                once_together = True
+        return once_together

     def infer_name_gender(
         self,
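The rewritten `names_are_in_coref` only links two names if they always co-occur: a chain containing exactly one of them is contradictory evidence and rules the pair out, and at least one chain must contain both. A standalone sketch over plain string chains (Renard's version compares joined `Mention` tokens):

```python
from typing import List

def names_are_in_coref(name1: str, name2: str, chains: List[List[str]]) -> bool:
    """True iff some chain contains both names and no chain contains
    exactly one of them."""
    once_together = False
    for chain in chains:
        name1_in = name1 in chain
        name2_in = name2 in chain
        if name1_in != name2_in:  # exactly one present: contradictory evidence
            return False
        if name1_in and name2_in:
            once_together = True
    return once_together

chains = [["Liz", "Elizabeth", "she"], ["the king", "he"]]
print(names_are_in_coref("Liz", "Elizabeth", chains))  # True
print(names_are_in_coref("Liz", "the king", chains))   # False
```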
--- renard_pipeline-0.4.1/renard/pipeline/characters_extraction.py
+++ renard_pipeline-0.6.0/renard/pipeline/characters_extraction.py
@@ -1,7 +1,9 @@
+import sys
 import renard.pipeline.character_unification as cu

 print(
-    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead.",
+    file=sys.stderr,
 )

 Character = cu.Character