renard-pipeline 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of renard-pipeline might be problematic. Click here for more details.
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/PKG-INFO +58 -4
- renard_pipeline-0.5.0/README.md +89 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/pyproject.toml +5 -2
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/graph_utils.py +11 -4
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/ner_utils.py +4 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/character_unification.py +26 -6
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/characters_extraction.py +3 -1
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/core.py +127 -25
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/corefs/corefs.py +30 -31
- renard_pipeline-0.5.0/renard/pipeline/graph_extraction.py +604 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/ner.py +3 -2
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/progress.py +32 -1
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/speaker_attribution.py +2 -3
- renard_pipeline-0.5.0/renard/pipeline/tokenization.py +84 -0
- renard_pipeline-0.5.0/renard/plot_utils.py +80 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/hypocorisms.py +3 -2
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/utils.py +57 -1
- renard_pipeline-0.4.0/README.md +0 -38
- renard_pipeline-0.4.0/renard/pipeline/graph_extraction.py +0 -515
- renard_pipeline-0.4.0/renard/pipeline/tokenization.py +0 -55
- renard_pipeline-0.4.0/renard/plot_utils.py +0 -67
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/LICENSE +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/gender.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/nltk_utils.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/__init__.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/corefs/__init__.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/preconfigured.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/preprocessing.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/quote_detection.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/sentiment_analysis.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/stanford_corenlp.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/py.typed +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/__init__.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/datas/License.txt +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/pronouns/__init__.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/pronouns/pronouns.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/titles/__init__.py +0 -0
- {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/titles/titles.py +0 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: renard-pipeline
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Relationships Extraction from NARrative Documents
|
|
5
|
+
Home-page: https://github.com/CompNet/Renard
|
|
5
6
|
License: GPL-3.0-only
|
|
6
7
|
Author: Arthur Amalvy
|
|
7
8
|
Author-email: arthur.amalvy@univ-avignon.fr
|
|
@@ -27,17 +28,21 @@ Requires-Dist: seqeval (==1.2.2)
|
|
|
27
28
|
Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
|
|
28
29
|
Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
|
|
29
30
|
Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
|
|
30
|
-
Requires-Dist: tibert (>=0.
|
|
31
|
+
Requires-Dist: tibert (>=0.4.0,<0.5.0)
|
|
31
32
|
Requires-Dist: torch (>=2.0.0,!=2.0.1)
|
|
32
33
|
Requires-Dist: tqdm (>=4.62.3,<5.0.0)
|
|
33
34
|
Requires-Dist: transformers (>=4.36.0,<5.0.0)
|
|
35
|
+
Project-URL: Documentation, https://compnet.github.io/Renard/
|
|
36
|
+
Project-URL: Repository, https://github.com/CompNet/Renard
|
|
34
37
|
Description-Content-Type: text/markdown
|
|
35
38
|
|
|
36
39
|
# Renard
|
|
37
40
|
|
|
41
|
+
[DOI](https://doi.org/10.21105/joss.06574)
|
|
42
|
+
|
|
38
43
|
Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
|
|
39
44
|
|
|
40
|
-

|
|
41
46
|
|
|
42
47
|
|
|
43
48
|
# Installation
|
|
@@ -46,6 +51,8 @@ You can install the latest version using pip:
|
|
|
46
51
|
|
|
47
52
|
> pip install renard-pipeline
|
|
48
53
|
|
|
54
|
+
Currently, Renard supports Python 3.8, 3.9 and 3.10.
|
|
55
|
+
|
|
49
56
|
|
|
50
57
|
# Documentation
|
|
51
58
|
|
|
@@ -56,7 +63,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
|
|
|
56
63
|
|
|
57
64
|
# Tutorial
|
|
58
65
|
|
|
59
|
-
|
|
66
|
+
Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from renard.pipeline import Pipeline
|
|
70
|
+
from renard.pipeline.tokenization import NLTKTokenizer
|
|
71
|
+
from renard.pipeline.ner import NLTKNamedEntityRecognizer
|
|
72
|
+
from renard.pipeline.character_unification import GraphRulesCharacterUnifier
|
|
73
|
+
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
|
|
74
|
+
|
|
75
|
+
with open("./my_doc.txt") as f:
|
|
76
|
+
text = f.read()
|
|
77
|
+
|
|
78
|
+
pipeline = Pipeline(
|
|
79
|
+
[
|
|
80
|
+
NLTKTokenizer(),
|
|
81
|
+
NLTKNamedEntityRecognizer(),
|
|
82
|
+
GraphRulesCharacterUnifier(min_appearance=10),
|
|
83
|
+
CoOccurrencesGraphExtractor(co_occurrences_dist=25)
|
|
84
|
+
]
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
out = pipeline(text)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard_tutorial.py`).
|
|
91
|
+
|
|
60
92
|
|
|
61
93
|
|
|
62
94
|
# Running tests
|
|
@@ -72,3 +104,25 @@ Expensive tests are disabled by default. These can be run by setting the environ
|
|
|
72
104
|
|
|
73
105
|
see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
|
|
74
106
|
|
|
107
|
+
|
|
108
|
+
# How to cite
|
|
109
|
+
|
|
110
|
+
If you use Renard in your research project, please cite it as follows:
|
|
111
|
+
|
|
112
|
+
```bibtex
|
|
113
|
+
@Article{Amalvy2024,
|
|
114
|
+
doi = {10.21105/joss.06574},
|
|
115
|
+
year = {2024},
|
|
116
|
+
publisher = {The Open Journal},
|
|
117
|
+
volume = {9},
|
|
118
|
+
number = {98},
|
|
119
|
+
pages = {6574},
|
|
120
|
+
author = {Amalvy, A. and Labatut, V. and Dufour, R.},
|
|
121
|
+
title = {Renard: A Modular Pipeline for Extracting Character
|
|
122
|
+
Networks from Narrative Texts},
|
|
123
|
+
journal = {Journal of Open Source Software},
|
|
124
|
+
}
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
|
|
128
|
+
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Renard
|
|
2
|
+
|
|
3
|
+
[DOI](https://doi.org/10.21105/joss.06574)
|
|
4
|
+
|
|
5
|
+
Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
|
|
6
|
+
|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Installation
|
|
11
|
+
|
|
12
|
+
You can install the latest version using pip:
|
|
13
|
+
|
|
14
|
+
> pip install renard-pipeline
|
|
15
|
+
|
|
16
|
+
Currently, Renard supports Python 3.8, 3.9 and 3.10.
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Documentation
|
|
20
|
+
|
|
21
|
+
Documentation, including installation instructions, can be found at https://compnet.github.io/Renard/
|
|
22
|
+
|
|
23
|
+
If you need local documentation, it can be generated using `Sphinx`. From the `docs` directory, `make html` should create documentation under `docs/_build/html`.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Tutorial
|
|
27
|
+
|
|
28
|
+
Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from renard.pipeline import Pipeline
|
|
32
|
+
from renard.pipeline.tokenization import NLTKTokenizer
|
|
33
|
+
from renard.pipeline.ner import NLTKNamedEntityRecognizer
|
|
34
|
+
from renard.pipeline.character_unification import GraphRulesCharacterUnifier
|
|
35
|
+
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
|
|
36
|
+
|
|
37
|
+
with open("./my_doc.txt") as f:
|
|
38
|
+
text = f.read()
|
|
39
|
+
|
|
40
|
+
pipeline = Pipeline(
|
|
41
|
+
[
|
|
42
|
+
NLTKTokenizer(),
|
|
43
|
+
NLTKNamedEntityRecognizer(),
|
|
44
|
+
GraphRulesCharacterUnifier(min_appearance=10),
|
|
45
|
+
CoOccurrencesGraphExtractor(co_occurrences_dist=25)
|
|
46
|
+
]
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
out = pipeline(text)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard_tutorial.py`).
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Running tests
|
|
57
|
+
|
|
58
|
+
`Renard` uses `pytest` for testing. To launch tests, use the following command:
|
|
59
|
+
|
|
60
|
+
> poetry run python -m pytest tests
|
|
61
|
+
|
|
62
|
+
Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Contributing
|
|
66
|
+
|
|
67
|
+
see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# How to cite
|
|
71
|
+
|
|
72
|
+
If you use Renard in your research project, please cite it as follows:
|
|
73
|
+
|
|
74
|
+
```bibtex
|
|
75
|
+
@Article{Amalvy2024,
|
|
76
|
+
doi = {10.21105/joss.06574},
|
|
77
|
+
year = {2024},
|
|
78
|
+
publisher = {The Open Journal},
|
|
79
|
+
volume = {9},
|
|
80
|
+
number = {98},
|
|
81
|
+
pages = {6574},
|
|
82
|
+
author = {Amalvy, A. and Labatut, V. and Dufour, R.},
|
|
83
|
+
title = {Renard: A Modular Pipeline for Extracting Character
|
|
84
|
+
Networks from Narrative Texts},
|
|
85
|
+
journal = {Journal of Open Source Software},
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "renard-pipeline"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.5.0"
|
|
4
4
|
description = "Relationships Extraction from NARrative Documents"
|
|
5
5
|
authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
|
|
6
6
|
license = "GPL-3.0-only"
|
|
@@ -8,6 +8,9 @@ readme = "README.md"
|
|
|
8
8
|
packages = [
|
|
9
9
|
{ include = "renard" }
|
|
10
10
|
]
|
|
11
|
+
homepage = "https://github.com/CompNet/Renard"
|
|
12
|
+
repository = "https://github.com/CompNet/Renard"
|
|
13
|
+
documentation = "https://compnet.github.io/Renard/"
|
|
11
14
|
|
|
12
15
|
[tool.poetry.dependencies]
|
|
13
16
|
# optional dependencies
|
|
@@ -28,7 +31,7 @@ matplotlib = "^3.5.3"
|
|
|
28
31
|
seqeval = "1.2.2"
|
|
29
32
|
pandas = "^2.0.0"
|
|
30
33
|
pytest = "^7.2.1"
|
|
31
|
-
tibert = "^0.
|
|
34
|
+
tibert = "^0.4.0"
|
|
32
35
|
grimbert = "^0.1.0"
|
|
33
36
|
datasets = "^2.16.1"
|
|
34
37
|
|
|
@@ -70,10 +70,17 @@ def graph_with_names(
|
|
|
70
70
|
else:
|
|
71
71
|
name_style_fn = name_style
|
|
72
72
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
73
|
+
mapping = {}
|
|
74
|
+
for character in G.nodes():
|
|
75
|
+
# NOTE: it is *possible* to have a graph where nodes are not
|
|
76
|
+
# characters (for example, simple strings). Therefore, we are
|
|
77
|
+
# lenient here
|
|
78
|
+
try:
|
|
79
|
+
mapping[character] = name_style_fn(character)
|
|
80
|
+
except AttributeError:
|
|
81
|
+
mapping[character] = character
|
|
82
|
+
|
|
83
|
+
return nx.relabel_nodes(G, mapping)
|
|
77
84
|
|
|
78
85
|
|
|
79
86
|
def layout_with_names(
|
|
@@ -110,6 +110,10 @@ class NERDataset(Dataset):
|
|
|
110
110
|
elt_context_mask = self._context_mask[index]
|
|
111
111
|
for i in range(len(element)):
|
|
112
112
|
w2t = batch.word_to_tokens(0, i)
|
|
113
|
+
# w2t can be None in case of truncation, which can happen
|
|
114
|
+
# if `element' is too long
|
|
115
|
+
if w2t is None:
|
|
116
|
+
continue
|
|
113
117
|
mask_value = elt_context_mask[i]
|
|
114
118
|
tokens_mask = [mask_value] * (w2t.end - w2t.start)
|
|
115
119
|
batch["context_mask"][w2t.start : w2t.end] = tokens_mask
|
|
@@ -61,6 +61,8 @@ def _assign_coreference_mentions(
|
|
|
61
61
|
# we assign each chain to the character with highest name
|
|
62
62
|
occurrence in it
|
|
63
63
|
for chain in corefs:
|
|
64
|
+
if len(char_mentions) == 0:
|
|
65
|
+
break
|
|
64
66
|
# determine the characters with the highest number of
|
|
65
67
|
occurrences
|
|
66
68
|
occ_counter = {}
|
|
@@ -98,8 +100,13 @@ class NaiveCharacterUnifier(PipelineStep):
|
|
|
98
100
|
character for it to be valid
|
|
99
101
|
"""
|
|
100
102
|
self.min_appearances = min_appearances
|
|
103
|
+
# a default value, will be set by _pipeline_init_
|
|
104
|
+
self.character_ner_tag = "PER"
|
|
101
105
|
super().__init__()
|
|
102
106
|
|
|
107
|
+
def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
|
|
108
|
+
self.character_ner_tag = character_ner_tag
|
|
109
|
+
|
|
103
110
|
def __call__(
|
|
104
111
|
self,
|
|
105
112
|
text: str,
|
|
@@ -112,7 +119,7 @@ class NaiveCharacterUnifier(PipelineStep):
|
|
|
112
119
|
:param tokens:
|
|
113
120
|
:param entities:
|
|
114
121
|
"""
|
|
115
|
-
persons = [e for e in entities if e.tag ==
|
|
122
|
+
persons = [e for e in entities if e.tag == self.character_ner_tag]
|
|
116
123
|
|
|
117
124
|
characters = defaultdict(list)
|
|
118
125
|
for entity in persons:
|
|
@@ -159,6 +166,7 @@ class GraphRulesCharacterUnifier(PipelineStep):
|
|
|
159
166
|
min_appearances: int = 0,
|
|
160
167
|
additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
|
|
161
168
|
link_corefs_mentions: bool = False,
|
|
169
|
+
ignore_lone_titles: Optional[Set[str]] = None,
|
|
162
170
|
) -> None:
|
|
163
171
|
"""
|
|
164
172
|
:param min_appearances: minimum number of appearances of a
|
|
@@ -173,20 +181,27 @@ class GraphRulesCharacterUnifier(PipelineStep):
|
|
|
173
181
|
extract a lot of spurious links. However, linking by
|
|
174
182
|
coref is sometimes the only way to resolve a character
|
|
175
183
|
alias.
|
|
184
|
+
:param ignore_lone_titles: a set of titles to ignore when
|
|
185
|
+
they stand on their own. This avoids extracting false
|
|
186
|
+
positive characters such as 'Mr.' or 'Miss'.
|
|
176
187
|
"""
|
|
177
188
|
self.min_appearances = min_appearances
|
|
178
189
|
self.additional_hypocorisms = additional_hypocorisms
|
|
179
190
|
self.link_corefs_mentions = link_corefs_mentions
|
|
191
|
+
self.ignore_lone_titles = ignore_lone_titles or set()
|
|
192
|
+
self.character_ner_tag = "PER" # a default value, will be set by _pipeline_init
|
|
180
193
|
|
|
181
194
|
super().__init__()
|
|
182
195
|
|
|
183
|
-
def _pipeline_init_(self, lang: str,
|
|
196
|
+
def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
|
|
184
197
|
self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
|
|
185
198
|
if not self.additional_hypocorisms is None:
|
|
186
199
|
for name, nicknames in self.additional_hypocorisms:
|
|
187
200
|
self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)
|
|
188
201
|
|
|
189
|
-
|
|
202
|
+
self.character_ner_tag = character_ner_tag
|
|
203
|
+
|
|
204
|
+
return super()._pipeline_init_(lang, **kwargs)
|
|
190
205
|
|
|
191
206
|
def __call__(
|
|
192
207
|
self,
|
|
@@ -196,12 +211,17 @@ class GraphRulesCharacterUnifier(PipelineStep):
|
|
|
196
211
|
) -> Dict[str, Any]:
|
|
197
212
|
import networkx as nx
|
|
198
213
|
|
|
199
|
-
mentions = [m for m in entities if m.tag ==
|
|
200
|
-
mentions_str =
|
|
214
|
+
mentions = [m for m in entities if m.tag == self.character_ner_tag]
|
|
215
|
+
mentions_str = set(
|
|
216
|
+
filter(
|
|
217
|
+
lambda m: not m in self.ignore_lone_titles,
|
|
218
|
+
map(lambda m: " ".join(m.tokens), mentions),
|
|
219
|
+
)
|
|
220
|
+
)
|
|
201
221
|
|
|
202
222
|
# * create a graph where each node is a mention detected by NER
|
|
203
223
|
G = nx.Graph()
|
|
204
|
-
for mention_str in
|
|
224
|
+
for mention_str in mentions_str:
|
|
205
225
|
G.add_node(mention_str)
|
|
206
226
|
|
|
207
227
|
# * HumanName local configuration - dependant on language
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
import sys
|
|
1
2
|
import renard.pipeline.character_unification as cu
|
|
2
3
|
|
|
3
4
|
print(
|
|
4
|
-
"[warning] the characters_extraction module is deprecated. Use character_unification instead."
|
|
5
|
+
"[warning] the characters_extraction module is deprecated. Use character_unification instead.",
|
|
6
|
+
file=sys.stderr,
|
|
5
7
|
)
|
|
6
8
|
|
|
7
9
|
Character = cu.Character
|
|
@@ -50,6 +50,13 @@ class Mention:
|
|
|
50
50
|
self_dict["end_idx"] = self.end_idx + shift
|
|
51
51
|
return self.__class__(**self_dict)
|
|
52
52
|
|
|
53
|
+
def __eq__(self, other: Mention) -> bool:
|
|
54
|
+
return (
|
|
55
|
+
self.tokens == other.tokens
|
|
56
|
+
and self.start_idx == other.start_idx
|
|
57
|
+
and self.end_idx == other.end_idx
|
|
58
|
+
)
|
|
59
|
+
|
|
53
60
|
def __hash__(self) -> int:
|
|
54
61
|
return hash(tuple(self.tokens) + (self.start_idx, self.end_idx))
|
|
55
62
|
|
|
@@ -72,11 +79,18 @@ class PipelineStep:
|
|
|
72
79
|
"""Initialize the :class:`PipelineStep` with a given configuration."""
|
|
73
80
|
pass
|
|
74
81
|
|
|
75
|
-
def _pipeline_init_(
|
|
76
|
-
|
|
82
|
+
def _pipeline_init_(
|
|
83
|
+
self, lang: str, progress_reporter: ProgressReporter, **kwargs
|
|
84
|
+
) -> Optional[Dict[Pipeline.PipelineParameter, Any]]:
|
|
85
|
+
"""Set the step configuration that is common to the whole
|
|
86
|
+
pipeline.
|
|
77
87
|
|
|
78
|
-
:param lang:
|
|
79
|
-
:param
|
|
88
|
+
:param lang: the lang of the whole pipeline
|
|
89
|
+
:param progress_reporter:
|
|
90
|
+
:param kwargs: additional pipeline parameters.
|
|
91
|
+
|
|
92
|
+
:return: a step can return a dictionary of pipeline params if
|
|
93
|
+
it wishes to modify some of these.
|
|
80
94
|
"""
|
|
81
95
|
supported_langs = self.supported_langs()
|
|
82
96
|
if not supported_langs == "any" and not lang in supported_langs:
|
|
@@ -143,13 +157,14 @@ class PipelineState:
|
|
|
143
157
|
#: input text
|
|
144
158
|
text: Optional[str]
|
|
145
159
|
|
|
146
|
-
#: text split into
|
|
147
|
-
|
|
160
|
+
#: text split into blocks of texts. When dynamic blocks are given,
|
|
161
|
+
#: the final network is dynamic, and split according to blocks.
|
|
162
|
+
dynamic_blocks: Optional[List[Tuple[int, int]]] = None
|
|
148
163
|
|
|
149
164
|
#: text split into tokens
|
|
150
165
|
tokens: Optional[List[str]] = None
|
|
151
|
-
#:
|
|
152
|
-
|
|
166
|
+
#: mapping from a character to its corresponding token
|
|
167
|
+
char2token: Optional[List[int]] = None
|
|
153
168
|
#: text split into sentences, each sentence being a list of
|
|
154
169
|
#: tokens
|
|
155
170
|
sentences: Optional[List[List[str]]] = None
|
|
@@ -175,14 +190,12 @@ class PipelineState:
|
|
|
175
190
|
#: network)
|
|
176
191
|
character_network: Optional[Union[List[nx.Graph], nx.Graph]] = None
|
|
177
192
|
|
|
193
|
+
# aliases of self.character_network
|
|
178
194
|
def get_characters_graph(self) -> Optional[Union[List[nx.Graph], nx.Graph]]:
|
|
179
|
-
print(
|
|
180
|
-
"[warning] the characters_graph attribute is deprecated, use character_network instead",
|
|
181
|
-
file=sys.stderr,
|
|
182
|
-
)
|
|
183
195
|
return self.character_network
|
|
184
196
|
|
|
185
197
|
characters_graph = property(get_characters_graph)
|
|
198
|
+
character_graph = property(get_characters_graph)
|
|
186
199
|
|
|
187
200
|
def get_character(
|
|
188
201
|
self, name: str, partial_match: bool = True
|
|
@@ -273,6 +286,9 @@ class PipelineState:
|
|
|
273
286
|
cumulative: bool = False,
|
|
274
287
|
stable_layout: bool = False,
|
|
275
288
|
layout: Optional[CharactersGraphLayout] = None,
|
|
289
|
+
node_kwargs: Optional[List[Dict[str, Any]]] = None,
|
|
290
|
+
edge_kwargs: Optional[List[Dict[str, Any]]] = None,
|
|
291
|
+
label_kwargs: Optional[List[Dict[str, Any]]] = None,
|
|
276
292
|
):
|
|
277
293
|
"""Plot ``self.character_graph`` using reasonable default
|
|
278
294
|
parameters, and save the produced figures in the specified
|
|
@@ -287,6 +303,9 @@ class PipelineState:
|
|
|
287
303
|
timestep. Characters' positions are based on the final
|
|
288
304
|
cumulative graph layout.
|
|
289
305
|
:param layout: pre-computed graph layout
|
|
306
|
+
:param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
|
|
307
|
+
:param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
|
|
308
|
+
:param label_kwargs: passed to :func:`nx.draw_networkx_labels`
|
|
290
309
|
"""
|
|
291
310
|
import matplotlib.pyplot as plt
|
|
292
311
|
|
|
@@ -310,13 +329,24 @@ class PipelineState:
|
|
|
310
329
|
)
|
|
311
330
|
layout = layout_nx_graph_reasonably(layout_graph)
|
|
312
331
|
|
|
332
|
+
node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
|
|
333
|
+
edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
|
|
334
|
+
label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
|
|
335
|
+
|
|
313
336
|
for i, G in enumerate(graphs):
|
|
314
337
|
_, ax = plt.subplots()
|
|
315
338
|
local_layout = layout
|
|
316
339
|
if not local_layout is None:
|
|
317
340
|
local_layout = layout_with_names(G, local_layout, name_style)
|
|
318
341
|
G = graph_with_names(G, name_style=name_style)
|
|
319
|
-
plot_nx_graph_reasonably(
|
|
342
|
+
plot_nx_graph_reasonably(
|
|
343
|
+
G,
|
|
344
|
+
ax=ax,
|
|
345
|
+
layout=local_layout,
|
|
346
|
+
node_kwargs=node_kwargs[i],
|
|
347
|
+
edge_kwargs=edge_kwargs[i],
|
|
348
|
+
label_kwargs=label_kwargs[i],
|
|
349
|
+
)
|
|
320
350
|
plt.savefig(f"{directory}/{i}.png")
|
|
321
351
|
plt.close()
|
|
322
352
|
|
|
@@ -328,6 +358,9 @@ class PipelineState:
|
|
|
328
358
|
] = "most_frequent",
|
|
329
359
|
layout: Optional[CharactersGraphLayout] = None,
|
|
330
360
|
fig: Optional[plt.Figure] = None,
|
|
361
|
+
node_kwargs: Optional[Dict[str, Any]] = None,
|
|
362
|
+
edge_kwargs: Optional[Dict[str, Any]] = None,
|
|
363
|
+
label_kwargs: Optional[Dict[str, Any]] = None,
|
|
331
364
|
):
|
|
332
365
|
"""Plot ``self.character_graph`` using reasonable parameters,
|
|
333
366
|
and save the produced figure to a file
|
|
@@ -337,6 +370,9 @@ class PipelineState:
|
|
|
337
370
|
:param layout: pre-computed graph layout
|
|
338
371
|
:param fig: if specified, this matplotlib figure will be used
|
|
339
372
|
for plotting
|
|
373
|
+
:param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
|
|
374
|
+
:param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
|
|
375
|
+
:param label_kwargs: passed to :func:`nx.draw_networkx_labels`
|
|
340
376
|
"""
|
|
341
377
|
import matplotlib.pyplot as plt
|
|
342
378
|
|
|
@@ -354,7 +390,14 @@ class PipelineState:
|
|
|
354
390
|
fig.set_dpi(300)
|
|
355
391
|
fig.set_size_inches(24, 24)
|
|
356
392
|
ax = fig.add_subplot(111)
|
|
357
|
-
plot_nx_graph_reasonably(
|
|
393
|
+
plot_nx_graph_reasonably(
|
|
394
|
+
G,
|
|
395
|
+
ax=ax,
|
|
396
|
+
layout=layout,
|
|
397
|
+
node_kwargs=node_kwargs,
|
|
398
|
+
edge_kwargs=edge_kwargs,
|
|
399
|
+
label_kwargs=label_kwargs,
|
|
400
|
+
)
|
|
358
401
|
plt.savefig(path)
|
|
359
402
|
plt.close()
|
|
360
403
|
|
|
@@ -368,6 +411,9 @@ class PipelineState:
|
|
|
368
411
|
graph_start_idx: int = 1,
|
|
369
412
|
stable_layout: bool = False,
|
|
370
413
|
layout: Optional[CharactersGraphLayout] = None,
|
|
414
|
+
node_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
|
415
|
+
edge_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
|
416
|
+
label_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
|
371
417
|
):
|
|
372
418
|
"""Plot ``self.character_network`` using reasonable default
|
|
373
419
|
parameters
|
|
@@ -393,6 +439,9 @@ class PipelineState:
|
|
|
393
439
|
same position in space at each timestep. Characters'
|
|
394
440
|
positions are based on the final cumulative graph layout.
|
|
395
441
|
:param layout: pre-computed graph layout
|
|
442
|
+
:param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
|
|
443
|
+
:param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
|
|
444
|
+
:param label_kwargs: passed to :func:`nx.draw_networkx_labels`
|
|
396
445
|
"""
|
|
397
446
|
import matplotlib.pyplot as plt
|
|
398
447
|
from matplotlib.widgets import Slider
|
|
@@ -411,13 +460,30 @@ class PipelineState:
|
|
|
411
460
|
fig.set_dpi(300)
|
|
412
461
|
fig.set_size_inches(24, 24)
|
|
413
462
|
ax = fig.add_subplot(111)
|
|
414
|
-
|
|
463
|
+
assert not isinstance(node_kwargs, list)
|
|
464
|
+
assert not isinstance(edge_kwargs, list)
|
|
465
|
+
assert not isinstance(label_kwargs, list)
|
|
466
|
+
plot_nx_graph_reasonably(
|
|
467
|
+
G,
|
|
468
|
+
ax=ax,
|
|
469
|
+
layout=layout,
|
|
470
|
+
node_kwargs=node_kwargs,
|
|
471
|
+
edge_kwargs=edge_kwargs,
|
|
472
|
+
label_kwargs=label_kwargs,
|
|
473
|
+
)
|
|
415
474
|
return
|
|
416
475
|
|
|
417
476
|
if not isinstance(self.character_network, list):
|
|
418
477
|
raise TypeError
|
|
419
478
|
# self.character_network is a list: plot a dynamic graph
|
|
420
479
|
|
|
480
|
+
node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
|
|
481
|
+
assert isinstance(node_kwargs, list)
|
|
482
|
+
edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
|
|
483
|
+
assert isinstance(edge_kwargs, list)
|
|
484
|
+
label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
|
|
485
|
+
assert isinstance(label_kwargs, list)
|
|
486
|
+
|
|
421
487
|
if fig is None:
|
|
422
488
|
fig, ax = plt.subplots()
|
|
423
489
|
assert not fig is None
|
|
@@ -433,12 +499,13 @@ class PipelineState:
|
|
|
433
499
|
|
|
434
500
|
def update(slider_value):
|
|
435
501
|
assert isinstance(self.character_network, list)
|
|
502
|
+
slider_i = int(slider_value) - 1
|
|
436
503
|
|
|
437
504
|
character_networks = self.character_network
|
|
438
505
|
if cumulative:
|
|
439
506
|
character_networks = cumulative_character_networks
|
|
440
507
|
|
|
441
|
-
G = character_networks[
|
|
508
|
+
G = character_networks[slider_i]
|
|
442
509
|
|
|
443
510
|
local_layout = layout
|
|
444
511
|
if not local_layout is None:
|
|
@@ -446,7 +513,14 @@ class PipelineState:
|
|
|
446
513
|
G = graph_with_names(G, name_style)
|
|
447
514
|
|
|
448
515
|
ax.clear()
|
|
449
|
-
plot_nx_graph_reasonably(
|
|
516
|
+
plot_nx_graph_reasonably(
|
|
517
|
+
G,
|
|
518
|
+
ax=ax,
|
|
519
|
+
layout=local_layout,
|
|
520
|
+
node_kwargs=node_kwargs[slider_i],
|
|
521
|
+
edge_kwargs=edge_kwargs[slider_i],
|
|
522
|
+
label_kwargs=label_kwargs[slider_i],
|
|
523
|
+
)
|
|
450
524
|
ax.set_xlim(-1.2, 1.2)
|
|
451
525
|
ax.set_ylim(-1.2, 1.2)
|
|
452
526
|
|
|
@@ -467,6 +541,10 @@ class PipelineState:
|
|
|
467
541
|
class Pipeline:
|
|
468
542
|
"""A flexible NLP pipeline"""
|
|
469
543
|
|
|
544
|
+
#: all the possible parameters of the whole pipeline, that are
|
|
545
|
+
#: shared between steps
|
|
546
|
+
PipelineParameter = Literal["lang", "progress_reporter", "character_ner_tag"]
|
|
547
|
+
|
|
470
548
|
def __init__(
|
|
471
549
|
self,
|
|
472
550
|
steps: List[PipelineStep],
|
|
@@ -489,17 +567,27 @@ class Pipeline:
|
|
|
489
567
|
self.progress_reporter = get_progress_reporter(progress_report)
|
|
490
568
|
|
|
491
569
|
self.lang = lang
|
|
570
|
+
self.character_ner_tag = "PER"
|
|
492
571
|
self.warn = warn
|
|
493
572
|
|
|
494
|
-
def
|
|
495
|
-
"""
|
|
573
|
+
def _pipeline_init_steps_(self, ignored_steps: Optional[List[str]] = None):
|
|
574
|
+
"""Initialise steps with global pipeline parameters.
|
|
575
|
+
|
|
496
576
|
:param ignored_steps: a list of steps production. All steps
|
|
497
577
|
with a production in ``ignored_steps`` will be ignored.
|
|
498
578
|
"""
|
|
499
|
-
steps_progress_reporter =
|
|
579
|
+
steps_progress_reporter = self.progress_reporter.get_subreporter()
|
|
500
580
|
steps = self._non_ignored_steps(ignored_steps)
|
|
581
|
+
pipeline_params = {
|
|
582
|
+
"progress_reporter": steps_progress_reporter,
|
|
583
|
+
"character_ner_tag": self.character_ner_tag,
|
|
584
|
+
}
|
|
501
585
|
for step in steps:
|
|
502
|
-
step._pipeline_init_(self.lang,
|
|
586
|
+
step_additional_params = step._pipeline_init_(self.lang, **pipeline_params)
|
|
587
|
+
if not step_additional_params is None:
|
|
588
|
+
for key, value in step_additional_params.items():
|
|
589
|
+
setattr(self, key, value)
|
|
590
|
+
pipeline_params[key] = value
|
|
503
591
|
|
|
504
592
|
def _non_ignored_steps(
|
|
505
593
|
self, ignored_steps: Optional[List[str]]
|
|
@@ -542,13 +630,27 @@ class Pipeline:
|
|
|
542
630
|
return (
|
|
543
631
|
False,
|
|
544
632
|
[
|
|
545
|
-
|
|
633
|
+
"".join(
|
|
634
|
+
[
|
|
635
|
+
f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs. "
|
|
636
|
+
+ f"needs: {step.needs()}. "
|
|
637
|
+
+ f"available: {pipeline_state}). "
|
|
638
|
+
+ f"missing: {step.needs() - pipeline_state}."
|
|
639
|
+
]
|
|
640
|
+
),
|
|
546
641
|
],
|
|
547
642
|
)
|
|
548
643
|
|
|
549
644
|
if not step.optional_needs().issubset(pipeline_state):
|
|
550
645
|
warnings.append(
|
|
551
|
-
|
|
646
|
+
"".join(
|
|
647
|
+
[
|
|
648
|
+
f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs. "
|
|
649
|
+
+ f"needs: {step.optional_needs()}. "
|
|
650
|
+
+ f"available: {pipeline_state}). "
|
|
651
|
+
+ f"missing: {step.optional_needs() - pipeline_state}."
|
|
652
|
+
]
|
|
653
|
+
)
|
|
552
654
|
)
|
|
553
655
|
|
|
554
656
|
pipeline_state = pipeline_state.union(step.production())
|
|
@@ -575,9 +677,9 @@ class Pipeline:
|
|
|
575
677
|
raise ValueError(warnings_or_errors)
|
|
576
678
|
if self.warn:
|
|
577
679
|
for warning in warnings_or_errors:
|
|
578
|
-
print(f"[warning] : {warning}")
|
|
680
|
+
print(f"[warning] : {warning}", file=sys.stderr)
|
|
579
681
|
|
|
580
|
-
self.
|
|
682
|
+
self._pipeline_init_steps_(ignored_steps)
|
|
581
683
|
|
|
582
684
|
state = PipelineState(text)
|
|
583
685
|
# sets attributes to PipelineState dynamically. This ensures
|