renard-pipeline 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of renard-pipeline might be problematic. Click here for more details.

Files changed (39) hide show
  1. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/PKG-INFO +58 -4
  2. renard_pipeline-0.5.0/README.md +89 -0
  3. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/pyproject.toml +5 -2
  4. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/graph_utils.py +11 -4
  5. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/ner_utils.py +4 -0
  6. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/character_unification.py +26 -6
  7. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/characters_extraction.py +3 -1
  8. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/core.py +127 -25
  9. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/corefs/corefs.py +30 -31
  10. renard_pipeline-0.5.0/renard/pipeline/graph_extraction.py +604 -0
  11. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/ner.py +3 -2
  12. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/progress.py +32 -1
  13. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/speaker_attribution.py +2 -3
  14. renard_pipeline-0.5.0/renard/pipeline/tokenization.py +84 -0
  15. renard_pipeline-0.5.0/renard/plot_utils.py +80 -0
  16. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/hypocorisms.py +3 -2
  17. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/utils.py +57 -1
  18. renard_pipeline-0.4.0/README.md +0 -38
  19. renard_pipeline-0.4.0/renard/pipeline/graph_extraction.py +0 -515
  20. renard_pipeline-0.4.0/renard/pipeline/tokenization.py +0 -55
  21. renard_pipeline-0.4.0/renard/plot_utils.py +0 -67
  22. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/LICENSE +0 -0
  23. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/gender.py +0 -0
  24. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/nltk_utils.py +0 -0
  25. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/__init__.py +0 -0
  26. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/corefs/__init__.py +0 -0
  27. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/preconfigured.py +0 -0
  28. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/preprocessing.py +0 -0
  29. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/quote_detection.py +0 -0
  30. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/sentiment_analysis.py +0 -0
  31. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/pipeline/stanford_corenlp.py +0 -0
  32. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/py.typed +0 -0
  33. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/__init__.py +0 -0
  34. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/datas/License.txt +0 -0
  35. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
  36. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/pronouns/__init__.py +0 -0
  37. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/pronouns/pronouns.py +0 -0
  38. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/titles/__init__.py +0 -0
  39. {renard_pipeline-0.4.0 → renard_pipeline-0.5.0}/renard/resources/titles/titles.py +0 -0
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: renard-pipeline
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Relationships Extraction from NARrative Documents
5
+ Home-page: https://github.com/CompNet/Renard
5
6
  License: GPL-3.0-only
6
7
  Author: Arthur Amalvy
7
8
  Author-email: arthur.amalvy@univ-avignon.fr
@@ -27,17 +28,21 @@ Requires-Dist: seqeval (==1.2.2)
27
28
  Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
28
29
  Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
29
30
  Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
30
- Requires-Dist: tibert (>=0.3.0,<0.4.0)
31
+ Requires-Dist: tibert (>=0.4.0,<0.5.0)
31
32
  Requires-Dist: torch (>=2.0.0,!=2.0.1)
32
33
  Requires-Dist: tqdm (>=4.62.3,<5.0.0)
33
34
  Requires-Dist: transformers (>=4.36.0,<5.0.0)
35
+ Project-URL: Documentation, https://compnet.github.io/Renard/
36
+ Project-URL: Repository, https://github.com/CompNet/Renard
34
37
  Description-Content-Type: text/markdown
35
38
 
36
39
  # Renard
37
40
 
41
+ [![DOI](https://joss.theoj.org/papers/10.21105/joss.06574/status.svg)](https://doi.org/10.21105/joss.06574)
42
+
38
43
  Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
39
44
 
40
- ![Character network extracted from "Pride and Prejudice"](./docs/pp_white_bg.svg)
45
+ ![The Renard logo](./docs/renard.svg)
41
46
 
42
47
 
43
48
  # Installation
@@ -46,6 +51,8 @@ You can install the latest version using pip:
46
51
 
47
52
  > pip install renard-pipeline
48
53
 
54
+ Currently, Renard supports Python 3.8, 3.9 and 3.10.
55
+
49
56
 
50
57
  # Documentation
51
58
 
@@ -56,7 +63,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
56
63
 
57
64
  # Tutorial
58
65
 
59
- `renard_tutorial.py` is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
66
+ Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
67
+
68
+ ```python
69
+ from renard.pipeline import Pipeline
70
+ from renard.pipeline.tokenization import NLTKTokenizer
71
+ from renard.pipeline.ner import NLTKNamedEntityRecognizer
72
+ from renard.pipeline.character_unification import GraphRulesCharacterUnifier
73
+ from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
74
+
75
+ with open("./my_doc.txt") as f:
76
+ text = f.read()
77
+
78
+ pipeline = Pipeline(
79
+ [
80
+ NLTKTokenizer(),
81
+ NLTKNamedEntityRecognizer(),
82
+ GraphRulesCharacterUnifier(min_appearance=10),
83
+ CoOccurrencesGraphExtractor(co_occurrences_dist=25)
84
+ ]
85
+ )
86
+
87
+ out = pipeline(text)
88
+ ```
89
+
90
+ For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
91
+
60
92
 
61
93
 
62
94
  # Running tests
@@ -72,3 +104,25 @@ Expensive tests are disabled by default. These can be run by setting the environ
72
104
 
73
105
  see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
74
106
 
107
+
108
+ # How to cite
109
+
110
+ If you use Renard in your research project, please cite it as follows:
111
+
112
+ ```bibtex
113
+ @Article{Amalvy2024,
114
+ doi = {10.21105/joss.06574},
115
+ year = {2024},
116
+ publisher = {The Open Journal},
117
+ volume = {9},
118
+ number = {98},
119
+ pages = {6574},
120
+ author = {Amalvy, A. and Labatut, V. and Dufour, R.},
121
+ title = {Renard: A Modular Pipeline for Extracting Character
122
+ Networks from Narrative Texts},
123
+ journal = {Journal of Open Source Software},
124
+ }
125
+ ```
126
+
127
+ We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
128
+
@@ -0,0 +1,89 @@
1
+ # Renard
2
+
3
+ [![DOI](https://joss.theoj.org/papers/10.21105/joss.06574/status.svg)](https://doi.org/10.21105/joss.06574)
4
+
5
+ Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
6
+
7
+ ![The Renard logo](./docs/renard.svg)
8
+
9
+
10
+ # Installation
11
+
12
+ You can install the latest version using pip:
13
+
14
+ > pip install renard-pipeline
15
+
16
+ Currently, Renard supports Python 3.8, 3.9 and 3.10.
17
+
18
+
19
+ # Documentation
20
+
21
+ Documentation, including installation instructions, can be found at https://compnet.github.io/Renard/
22
+
23
+ If you need local documentation, it can be generated using `Sphinx`. From the `docs` directory, `make html` should create documentation under `docs/_build/html`.
24
+
25
+
26
+ # Tutorial
27
+
28
+ Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
29
+
30
+ ```python
31
+ from renard.pipeline import Pipeline
32
+ from renard.pipeline.tokenization import NLTKTokenizer
33
+ from renard.pipeline.ner import NLTKNamedEntityRecognizer
34
+ from renard.pipeline.character_unification import GraphRulesCharacterUnifier
35
+ from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
36
+
37
+ with open("./my_doc.txt") as f:
38
+ text = f.read()
39
+
40
+ pipeline = Pipeline(
41
+ [
42
+ NLTKTokenizer(),
43
+ NLTKNamedEntityRecognizer(),
44
+ GraphRulesCharacterUnifier(min_appearance=10),
45
+ CoOccurrencesGraphExtractor(co_occurrences_dist=25)
46
+ ]
47
+ )
48
+
49
+ out = pipeline(text)
50
+ ```
51
+
52
+ For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
53
+
54
+
55
+
56
+ # Running tests
57
+
58
+ `Renard` uses `pytest` for testing. To launch tests, use the following command :
59
+
60
+ > poetry run python -m pytest tests
61
+
62
+ Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
63
+
64
+
65
+ # Contributing
66
+
67
+ see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
68
+
69
+
70
+ # How to cite
71
+
72
+ If you use Renard in your research project, please cite it as follows:
73
+
74
+ ```bibtex
75
+ @Article{Amalvy2024,
76
+ doi = {10.21105/joss.06574},
77
+ year = {2024},
78
+ publisher = {The Open Journal},
79
+ volume = {9},
80
+ number = {98},
81
+ pages = {6574},
82
+ author = {Amalvy, A. and Labatut, V. and Dufour, R.},
83
+ title = {Renard: A Modular Pipeline for Extracting Character
84
+ Networks from Narrative Texts},
85
+ journal = {Journal of Open Source Software},
86
+ }
87
+ ```
88
+
89
+ We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "renard-pipeline"
3
- version = "0.4.0"
3
+ version = "0.5.0"
4
4
  description = "Relationships Extraction from NARrative Documents"
5
5
  authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
6
6
  license = "GPL-3.0-only"
@@ -8,6 +8,9 @@ readme = "README.md"
8
8
  packages = [
9
9
  { include = "renard" }
10
10
  ]
11
+ homepage = "https://github.com/CompNet/Renard"
12
+ repository = "https://github.com/CompNet/Renard"
13
+ documentation = "https://compnet.github.io/Renard/"
11
14
 
12
15
  [tool.poetry.dependencies]
13
16
  # optional dependencies
@@ -28,7 +31,7 @@ matplotlib = "^3.5.3"
28
31
  seqeval = "1.2.2"
29
32
  pandas = "^2.0.0"
30
33
  pytest = "^7.2.1"
31
- tibert = "^0.3.0"
34
+ tibert = "^0.4.0"
32
35
  grimbert = "^0.1.0"
33
36
  datasets = "^2.16.1"
34
37
 
@@ -70,10 +70,17 @@ def graph_with_names(
70
70
  else:
71
71
  name_style_fn = name_style
72
72
 
73
- return nx.relabel_nodes(
74
- G,
75
- {character: name_style_fn(character) for character in G.nodes()}, # type: ignore
76
- )
73
+ mapping = {}
74
+ for character in G.nodes():
75
+ # NOTE: it is *possible* to have a graph where nodes are not
76
+ # characters (for example, simple strings). Therefore, we are
77
+ # lenient here
78
+ try:
79
+ mapping[character] = name_style_fn(character)
80
+ except AttributeError:
81
+ mapping[character] = character
82
+
83
+ return nx.relabel_nodes(G, mapping)
77
84
 
78
85
 
79
86
  def layout_with_names(
@@ -110,6 +110,10 @@ class NERDataset(Dataset):
110
110
  elt_context_mask = self._context_mask[index]
111
111
  for i in range(len(element)):
112
112
  w2t = batch.word_to_tokens(0, i)
113
+ # w2t can be None in case of truncation, which can happen
114
+ # if `element' is too long
115
+ if w2t is None:
116
+ continue
113
117
  mask_value = elt_context_mask[i]
114
118
  tokens_mask = [mask_value] * (w2t.end - w2t.start)
115
119
  batch["context_mask"][w2t.start : w2t.end] = tokens_mask
@@ -61,6 +61,8 @@ def _assign_coreference_mentions(
61
61
  # we assign each chain to the character with highest name
62
62
  # occurence in it
63
63
  for chain in corefs:
64
+ if len(char_mentions) == 0:
65
+ break
64
66
  # determine the characters with the highest number of
65
67
  # occurences
66
68
  occ_counter = {}
@@ -98,8 +100,13 @@ class NaiveCharacterUnifier(PipelineStep):
98
100
  character for it to be valid
99
101
  """
100
102
  self.min_appearances = min_appearances
103
+ # a default value, will be set by _pipeline_init_
104
+ self.character_ner_tag = "PER"
101
105
  super().__init__()
102
106
 
107
+ def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
108
+ self.character_ner_tag = character_ner_tag
109
+
103
110
  def __call__(
104
111
  self,
105
112
  text: str,
@@ -112,7 +119,7 @@ class NaiveCharacterUnifier(PipelineStep):
112
119
  :param tokens:
113
120
  :param entities:
114
121
  """
115
- persons = [e for e in entities if e.tag == "PER"]
122
+ persons = [e for e in entities if e.tag == self.character_ner_tag]
116
123
 
117
124
  characters = defaultdict(list)
118
125
  for entity in persons:
@@ -159,6 +166,7 @@ class GraphRulesCharacterUnifier(PipelineStep):
159
166
  min_appearances: int = 0,
160
167
  additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
161
168
  link_corefs_mentions: bool = False,
169
+ ignore_lone_titles: Optional[Set[str]] = None,
162
170
  ) -> None:
163
171
  """
164
172
  :param min_appearances: minimum number of appearances of a
@@ -173,20 +181,27 @@ class GraphRulesCharacterUnifier(PipelineStep):
173
181
  extract a lot of spurious links. However, linking by
174
182
  coref is sometimes the only way to resolve a character
175
183
  alias.
184
+ :param ignore_lone_titles: a set of titles to ignore when
185
+ they stand on their own. This avoids extracting false
186
+ positives characters such as 'Mr.' or 'Miss'.
176
187
  """
177
188
  self.min_appearances = min_appearances
178
189
  self.additional_hypocorisms = additional_hypocorisms
179
190
  self.link_corefs_mentions = link_corefs_mentions
191
+ self.ignore_lone_titles = ignore_lone_titles or set()
192
+ self.character_ner_tag = "PER"  # a default value, will be set by _pipeline_init_
180
193
 
181
194
  super().__init__()
182
195
 
183
- def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
196
+ def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
184
197
  self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
185
198
  if not self.additional_hypocorisms is None:
186
199
  for name, nicknames in self.additional_hypocorisms:
187
200
  self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)
188
201
 
189
- return super()._pipeline_init_(lang, progress_reporter)
202
+ self.character_ner_tag = character_ner_tag
203
+
204
+ return super()._pipeline_init_(lang, **kwargs)
190
205
 
191
206
  def __call__(
192
207
  self,
@@ -196,12 +211,17 @@ class GraphRulesCharacterUnifier(PipelineStep):
196
211
  ) -> Dict[str, Any]:
197
212
  import networkx as nx
198
213
 
199
- mentions = [m for m in entities if m.tag == "PER"]
200
- mentions_str = [" ".join(m.tokens) for m in mentions]
214
+ mentions = [m for m in entities if m.tag == self.character_ner_tag]
215
+ mentions_str = set(
216
+ filter(
217
+ lambda m: not m in self.ignore_lone_titles,
218
+ map(lambda m: " ".join(m.tokens), mentions),
219
+ )
220
+ )
201
221
 
202
222
  # * create a graph where each node is a mention detected by NER
203
223
  G = nx.Graph()
204
- for mention_str in set(mentions_str):
224
+ for mention_str in mentions_str:
205
225
  G.add_node(mention_str)
206
226
 
207
227
  # * HumanName local configuration - dependant on language
@@ -1,7 +1,9 @@
1
+ import sys
1
2
  import renard.pipeline.character_unification as cu
2
3
 
3
4
  print(
4
- "[warning] the characters_extraction module is deprecated. Use character_unification instead."
5
+ "[warning] the characters_extraction module is deprecated. Use character_unification instead.",
6
+ file=sys.stderr,
5
7
  )
6
8
 
7
9
  Character = cu.Character
@@ -50,6 +50,13 @@ class Mention:
50
50
  self_dict["end_idx"] = self.end_idx + shift
51
51
  return self.__class__(**self_dict)
52
52
 
53
+ def __eq__(self, other: Mention) -> bool:
54
+ return (
55
+ self.tokens == other.tokens
56
+ and self.start_idx == other.start_idx
57
+ and self.end_idx == other.end_idx
58
+ )
59
+
53
60
  def __hash__(self) -> int:
54
61
  return hash(tuple(self.tokens) + (self.start_idx, self.end_idx))
55
62
 
@@ -72,11 +79,18 @@ class PipelineStep:
72
79
  """Initialize the :class:`PipelineStep` with a given configuration."""
73
80
  pass
74
81
 
75
- def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
76
- """Set the step configuration that is common to the whole pipeline.
82
+ def _pipeline_init_(
83
+ self, lang: str, progress_reporter: ProgressReporter, **kwargs
84
+ ) -> Optional[Dict[Pipeline.PipelineParameter, Any]]:
85
+ """Set the step configuration that is common to the whole
86
+ pipeline.
77
87
 
78
- :param lang: ISO 639-3 language string
79
- :param progress_report:
88
+ :param lang: the lang of the whole pipeline
89
+ :param progress_reporter:
90
+ :param kwargs: additional pipeline parameters.
91
+
92
+ :return: a step can return a dictionary of pipeline params if
93
+ it wishes to modify some of these.
80
94
  """
81
95
  supported_langs = self.supported_langs()
82
96
  if not supported_langs == "any" and not lang in supported_langs:
@@ -143,13 +157,14 @@ class PipelineState:
143
157
  #: input text
144
158
  text: Optional[str]
145
159
 
146
- #: text split into chapters
147
- chapters: Optional[List[str]] = None
160
+ #: text split into blocks of texts. When dynamic blocks are given,
161
+ #: the final network is dynamic, and split according to blocks.
162
+ dynamic_blocks: Optional[List[Tuple[int, int]]] = None
148
163
 
149
164
  #: text splitted in tokens
150
165
  tokens: Optional[List[str]] = None
151
- #: text splitted in tokens, by chapter
152
- chapter_tokens: Optional[List[List[str]]] = None
166
+ #: mapping from a character to its corresponding token
167
+ char2token: Optional[List[int]] = None
153
168
  #: text splitted into sentences, each sentence being a list of
154
169
  #: tokens
155
170
  sentences: Optional[List[List[str]]] = None
@@ -175,14 +190,12 @@ class PipelineState:
175
190
  #: network)
176
191
  character_network: Optional[Union[List[nx.Graph], nx.Graph]] = None
177
192
 
193
+ # aliases of self.character_network
178
194
  def get_characters_graph(self) -> Optional[Union[List[nx.Graph], nx.Graph]]:
179
- print(
180
- "[warning] the characters_graph attribute is deprecated, use character_network instead",
181
- file=sys.stderr,
182
- )
183
195
  return self.character_network
184
196
 
185
197
  characters_graph = property(get_characters_graph)
198
+ character_graph = property(get_characters_graph)
186
199
 
187
200
  def get_character(
188
201
  self, name: str, partial_match: bool = True
@@ -273,6 +286,9 @@ class PipelineState:
273
286
  cumulative: bool = False,
274
287
  stable_layout: bool = False,
275
288
  layout: Optional[CharactersGraphLayout] = None,
289
+ node_kwargs: Optional[List[Dict[str, Any]]] = None,
290
+ edge_kwargs: Optional[List[Dict[str, Any]]] = None,
291
+ label_kwargs: Optional[List[Dict[str, Any]]] = None,
276
292
  ):
277
293
  """Plot ``self.character_graph`` using reasonable default
278
294
  parameters, and save the produced figures in the specified
@@ -287,6 +303,9 @@ class PipelineState:
287
303
  timestep. Characters' positions are based on the final
288
304
  cumulative graph layout.
289
305
  :param layout: pre-computed graph layout
306
+ :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
307
+ :param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
308
+ :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
290
309
  """
291
310
  import matplotlib.pyplot as plt
292
311
 
@@ -310,13 +329,24 @@ class PipelineState:
310
329
  )
311
330
  layout = layout_nx_graph_reasonably(layout_graph)
312
331
 
332
+ node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
333
+ edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
334
+ label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
335
+
313
336
  for i, G in enumerate(graphs):
314
337
  _, ax = plt.subplots()
315
338
  local_layout = layout
316
339
  if not local_layout is None:
317
340
  local_layout = layout_with_names(G, local_layout, name_style)
318
341
  G = graph_with_names(G, name_style=name_style)
319
- plot_nx_graph_reasonably(G, ax=ax, layout=local_layout)
342
+ plot_nx_graph_reasonably(
343
+ G,
344
+ ax=ax,
345
+ layout=local_layout,
346
+ node_kwargs=node_kwargs[i],
347
+ edge_kwargs=edge_kwargs[i],
348
+ label_kwargs=label_kwargs[i],
349
+ )
320
350
  plt.savefig(f"{directory}/{i}.png")
321
351
  plt.close()
322
352
 
@@ -328,6 +358,9 @@ class PipelineState:
328
358
  ] = "most_frequent",
329
359
  layout: Optional[CharactersGraphLayout] = None,
330
360
  fig: Optional[plt.Figure] = None,
361
+ node_kwargs: Optional[Dict[str, Any]] = None,
362
+ edge_kwargs: Optional[Dict[str, Any]] = None,
363
+ label_kwargs: Optional[Dict[str, Any]] = None,
331
364
  ):
332
365
  """Plot ``self.character_graph`` using reasonable parameters,
333
366
  and save the produced figure to a file
@@ -337,6 +370,9 @@ class PipelineState:
337
370
  :param layout: pre-computed graph layout
338
371
  :param fig: if specified, this matplotlib figure will be used
339
372
  for plotting
373
+ :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
374
+ :param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
375
+ :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
340
376
  """
341
377
  import matplotlib.pyplot as plt
342
378
 
@@ -354,7 +390,14 @@ class PipelineState:
354
390
  fig.set_dpi(300)
355
391
  fig.set_size_inches(24, 24)
356
392
  ax = fig.add_subplot(111)
357
- plot_nx_graph_reasonably(G, ax=ax, layout=layout)
393
+ plot_nx_graph_reasonably(
394
+ G,
395
+ ax=ax,
396
+ layout=layout,
397
+ node_kwargs=node_kwargs,
398
+ edge_kwargs=edge_kwargs,
399
+ label_kwargs=label_kwargs,
400
+ )
358
401
  plt.savefig(path)
359
402
  plt.close()
360
403
 
@@ -368,6 +411,9 @@ class PipelineState:
368
411
  graph_start_idx: int = 1,
369
412
  stable_layout: bool = False,
370
413
  layout: Optional[CharactersGraphLayout] = None,
414
+ node_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
415
+ edge_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
416
+ label_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
371
417
  ):
372
418
  """Plot ``self.character_network`` using reasonable default
373
419
  parameters
@@ -393,6 +439,9 @@ class PipelineState:
393
439
  same position in space at each timestep. Characters'
394
440
  positions are based on the final cumulative graph layout.
395
441
  :param layout: pre-computed graph layout
442
+ :param node_kwargs: passed to :func:`nx.draw_networkx_nodes`
443
+ :param edge_kwargs: passed to :func:`nx.draw_networkx_edges`
444
+ :param label_kwargs: passed to :func:`nx.draw_networkx_labels`
396
445
  """
397
446
  import matplotlib.pyplot as plt
398
447
  from matplotlib.widgets import Slider
@@ -411,13 +460,30 @@ class PipelineState:
411
460
  fig.set_dpi(300)
412
461
  fig.set_size_inches(24, 24)
413
462
  ax = fig.add_subplot(111)
414
- plot_nx_graph_reasonably(G, ax=ax, layout=layout)
463
+ assert not isinstance(node_kwargs, list)
464
+ assert not isinstance(edge_kwargs, list)
465
+ assert not isinstance(label_kwargs, list)
466
+ plot_nx_graph_reasonably(
467
+ G,
468
+ ax=ax,
469
+ layout=layout,
470
+ node_kwargs=node_kwargs,
471
+ edge_kwargs=edge_kwargs,
472
+ label_kwargs=label_kwargs,
473
+ )
415
474
  return
416
475
 
417
476
  if not isinstance(self.character_network, list):
418
477
  raise TypeError
419
478
  # self.character_network is a list: plot a dynamic graph
420
479
 
480
+ node_kwargs = node_kwargs or [{} for _ in range(len(self.character_network))]
481
+ assert isinstance(node_kwargs, list)
482
+ edge_kwargs = edge_kwargs or [{} for _ in range(len(self.character_network))]
483
+ assert isinstance(edge_kwargs, list)
484
+ label_kwargs = label_kwargs or [{} for _ in range(len(self.character_network))]
485
+ assert isinstance(label_kwargs, list)
486
+
421
487
  if fig is None:
422
488
  fig, ax = plt.subplots()
423
489
  assert not fig is None
@@ -433,12 +499,13 @@ class PipelineState:
433
499
 
434
500
  def update(slider_value):
435
501
  assert isinstance(self.character_network, list)
502
+ slider_i = int(slider_value) - 1
436
503
 
437
504
  character_networks = self.character_network
438
505
  if cumulative:
439
506
  character_networks = cumulative_character_networks
440
507
 
441
- G = character_networks[int(slider_value) - 1]
508
+ G = character_networks[slider_i]
442
509
 
443
510
  local_layout = layout
444
511
  if not local_layout is None:
@@ -446,7 +513,14 @@ class PipelineState:
446
513
  G = graph_with_names(G, name_style)
447
514
 
448
515
  ax.clear()
449
- plot_nx_graph_reasonably(G, ax=ax, layout=local_layout)
516
+ plot_nx_graph_reasonably(
517
+ G,
518
+ ax=ax,
519
+ layout=local_layout,
520
+ node_kwargs=node_kwargs[slider_i],
521
+ edge_kwargs=edge_kwargs[slider_i],
522
+ label_kwargs=label_kwargs[slider_i],
523
+ )
450
524
  ax.set_xlim(-1.2, 1.2)
451
525
  ax.set_ylim(-1.2, 1.2)
452
526
 
@@ -467,6 +541,10 @@ class PipelineState:
467
541
  class Pipeline:
468
542
  """A flexible NLP pipeline"""
469
543
 
544
+ #: all the possible parameters of the whole pipeline, that are
545
+ #: shared between steps
546
+ PipelineParameter = Literal["lang", "progress_reporter", "character_ner_tag"]
547
+
470
548
  def __init__(
471
549
  self,
472
550
  steps: List[PipelineStep],
@@ -489,17 +567,27 @@ class Pipeline:
489
567
  self.progress_reporter = get_progress_reporter(progress_report)
490
568
 
491
569
  self.lang = lang
570
+ self.character_ner_tag = "PER"
492
571
  self.warn = warn
493
572
 
494
- def _pipeline_init_steps(self, ignored_steps: Optional[List[str]] = None):
495
- """
573
+ def _pipeline_init_steps_(self, ignored_steps: Optional[List[str]] = None):
574
+ """Initialise steps with global pipeline parameters.
575
+
496
576
  :param ignored_steps: a list of steps production. All steps
497
577
  with a production in ``ignored_steps`` will be ignored.
498
578
  """
499
- steps_progress_reporter = get_progress_reporter(self.progress_report)
579
+ steps_progress_reporter = self.progress_reporter.get_subreporter()
500
580
  steps = self._non_ignored_steps(ignored_steps)
581
+ pipeline_params = {
582
+ "progress_reporter": steps_progress_reporter,
583
+ "character_ner_tag": self.character_ner_tag,
584
+ }
501
585
  for step in steps:
502
- step._pipeline_init_(self.lang, steps_progress_reporter)
586
+ step_additional_params = step._pipeline_init_(self.lang, **pipeline_params)
587
+ if not step_additional_params is None:
588
+ for key, value in step_additional_params.items():
589
+ setattr(self, key, value)
590
+ pipeline_params[key] = value
503
591
 
504
592
  def _non_ignored_steps(
505
593
  self, ignored_steps: Optional[List[str]]
@@ -542,13 +630,27 @@ class Pipeline:
542
630
  return (
543
631
  False,
544
632
  [
545
- f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs (needs : {step.needs()}, available : {pipeline_state})"
633
+ "".join(
634
+ [
635
+ f"step {i + 1} ({step.__class__.__name__}) has unsatisfied needs. "
636
+ + f"needs: {step.needs()}. "
637
+ + f"available: {pipeline_state}). "
638
+ + f"missing: {step.needs() - pipeline_state}."
639
+ ]
640
+ ),
546
641
  ],
547
642
  )
548
643
 
549
644
  if not step.optional_needs().issubset(pipeline_state):
550
645
  warnings.append(
551
- f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs : (optional needs : {step.optional_needs()}, available : {pipeline_state})"
646
+ "".join(
647
+ [
648
+ f"step {i + 1} ({step.__class__.__name__}) has unsatisfied optional needs. "
649
+ + f"needs: {step.optional_needs()}. "
650
+ + f"available: {pipeline_state}). "
651
+ + f"missing: {step.optional_needs() - pipeline_state}."
652
+ ]
653
+ )
552
654
  )
553
655
 
554
656
  pipeline_state = pipeline_state.union(step.production())
@@ -575,9 +677,9 @@ class Pipeline:
575
677
  raise ValueError(warnings_or_errors)
576
678
  if self.warn:
577
679
  for warning in warnings_or_errors:
578
- print(f"[warning] : {warning}")
680
+ print(f"[warning] : {warning}", file=sys.stderr)
579
681
 
580
- self._pipeline_init_steps(ignored_steps)
682
+ self._pipeline_init_steps_(ignored_steps)
581
683
 
582
684
  state = PipelineState(text)
583
685
  # sets attributes to PipelineState dynamically. This ensures