renard-pipeline 0.4.1__tar.gz → 0.6.0__tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of renard-pipeline has been flagged as potentially problematic.
Files changed (42)
  1. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/PKG-INFO +45 -20
  2. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/README.md +26 -2
  3. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/pyproject.toml +19 -19
  4. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/graph_utils.py +11 -4
  5. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/ner_utils.py +24 -14
  6. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/character_unification.py +72 -19
  7. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/characters_extraction.py +3 -1
  8. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/core.py +141 -26
  9. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/corefs/corefs.py +32 -33
  10. renard_pipeline-0.6.0/renard/pipeline/graph_extraction.py +604 -0
  11. renard_pipeline-0.6.0/renard/pipeline/ner/__init__.py +1 -0
  12. {renard_pipeline-0.4.1/renard/pipeline → renard_pipeline-0.6.0/renard/pipeline/ner}/ner.py +47 -76
  13. renard_pipeline-0.6.0/renard/pipeline/ner/retrieval.py +375 -0
  14. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/progress.py +32 -1
  15. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/speaker_attribution.py +2 -3
  16. renard_pipeline-0.6.0/renard/pipeline/tokenization.py +84 -0
  17. renard_pipeline-0.6.0/renard/plot_utils.py +87 -0
  18. renard_pipeline-0.6.0/renard/resources/determiners/__init__.py +1 -0
  19. renard_pipeline-0.6.0/renard/resources/determiners/determiners.py +41 -0
  20. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/hypocorisms.py +3 -2
  21. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/utils.py +57 -1
  22. renard_pipeline-0.4.1/renard/pipeline/graph_extraction.py +0 -515
  23. renard_pipeline-0.4.1/renard/pipeline/tokenization.py +0 -55
  24. renard_pipeline-0.4.1/renard/plot_utils.py +0 -67
  25. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/LICENSE +0 -0
  26. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/gender.py +0 -0
  27. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/nltk_utils.py +0 -0
  28. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/__init__.py +0 -0
  29. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/corefs/__init__.py +0 -0
  30. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/preconfigured.py +0 -0
  31. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/preprocessing.py +0 -0
  32. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/quote_detection.py +0 -0
  33. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/sentiment_analysis.py +0 -0
  34. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/stanford_corenlp.py +0 -0
  35. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/py.typed +0 -0
  36. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/__init__.py +0 -0
  37. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/datas/License.txt +0 -0
  38. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -0
  39. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/pronouns/__init__.py +0 -0
  40. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/pronouns/pronouns.py +0 -0
  41. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/titles/__init__.py +0 -0
  42. {renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/resources/titles/titles.py +0 -0
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/PKG-INFO

@@ -1,46 +1,49 @@
  Metadata-Version: 2.1
  Name: renard-pipeline
- Version: 0.4.1
+ Version: 0.6.0
  Summary: Relationships Extraction from NARrative Documents
  Home-page: https://github.com/CompNet/Renard
  License: GPL-3.0-only
  Author: Arthur Amalvy
  Author-email: arthur.amalvy@univ-avignon.fr
- Requires-Python: >=3.8,<3.11
+ Requires-Python: >=3.8,<3.12
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
  Provides-Extra: spacy
  Provides-Extra: stanza
- Requires-Dist: coreferee (>=1.4.0,<2.0.0) ; extra == "spacy"
- Requires-Dist: datasets (>=2.16.1,<3.0.0)
- Requires-Dist: grimbert (>=0.1.0,<0.2.0)
- Requires-Dist: matplotlib (>=3.5.3,<4.0.0)
- Requires-Dist: more-itertools (>=10.1.0,<11.0.0)
- Requires-Dist: nameparser (>=1.1.0,<2.0.0)
- Requires-Dist: networkx (>=2.6.3,<3.0.0)
- Requires-Dist: nltk (>=3.6.5,<4.0.0)
- Requires-Dist: pandas (>=2.0.0,<3.0.0)
- Requires-Dist: pytest (>=7.2.1,<8.0.0)
- Requires-Dist: seqeval (==1.2.2)
- Requires-Dist: spacy (>=3.5.0,<4.0.0) ; extra == "spacy"
- Requires-Dist: spacy-transformers (>=1.2.1,<2.0.0) ; extra == "spacy"
- Requires-Dist: stanza (>=1.3.0,<2.0.0) ; extra == "stanza"
- Requires-Dist: tibert (>=0.3.0,<0.4.0)
+ Requires-Dist: coreferee (>=1.4,<2.0) ; extra == "spacy"
+ Requires-Dist: datasets (>=3.0,<4.0)
+ Requires-Dist: grimbert (>=0.1,<0.2)
+ Requires-Dist: matplotlib (>=3.5,<4.0)
+ Requires-Dist: more-itertools (>=10.5,<11.0)
+ Requires-Dist: nameparser (>=1.1,<2.0)
+ Requires-Dist: networkx (>=3.0,<4.0)
+ Requires-Dist: nltk (>=3.9,<4.0)
+ Requires-Dist: pandas (>=2.0,<3.0)
+ Requires-Dist: pytest (>=8.3.0,<9.0.0)
+ Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
+ Requires-Dist: spacy (>=3.5,<4.0) ; extra == "spacy"
+ Requires-Dist: spacy-transformers (>=1.3,<2.0) ; extra == "spacy"
+ Requires-Dist: stanza (>=1.3,<2.0) ; extra == "stanza"
+ Requires-Dist: tibert (>=0.5,<0.6)
  Requires-Dist: torch (>=2.0.0,!=2.0.1)
  Requires-Dist: tqdm (>=4.62.3,<5.0.0)
- Requires-Dist: transformers (>=4.36.0,<5.0.0)
+ Requires-Dist: transformers (>=4.36,<5.0)
  Project-URL: Documentation, https://compnet.github.io/Renard/
  Project-URL: Repository, https://github.com/CompNet/Renard
  Description-Content-Type: text/markdown

  # Renard

- Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+ [![DOI](https://joss.theoj.org/papers/10.21105/joss.06574/status.svg)](https://doi.org/10.21105/joss.06574)

- ![Character network extracted from "Pride and Prejudice"](./docs/pp_white_bg.svg)
+ Renard (Relationship Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+ ![The Renard logo](./docs/renard.svg)


  # Installation
@@ -102,3 +105,25 @@ Expensive tests are disabled by default. These can be run by setting the environ

  see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).

+
+ # How to cite
+
+ If you use Renard in your research project, please cite it as follows:
+
+ ```bibtex
+ @Article{Amalvy2024,
+ doi = {10.21105/joss.06574},
+ year = {2024},
+ publisher = {The Open Journal},
+ volume = {9},
+ number = {98},
+ pages = {6574},
+ author = {Amalvy, A. and Labatut, V. and Dufour, R.},
+ title = {Renard: A Modular Pipeline for Extracting Character
+ Networks from Narrative Texts},
+ journal = {Journal of Open Source Software},
+ }
+ ```
+
+ We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
+
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/README.md

@@ -1,8 +1,10 @@
  # Renard

- Renard (Relationships Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+ [![DOI](https://joss.theoj.org/papers/10.21105/joss.06574/status.svg)](https://doi.org/10.21105/joss.06574)

- ![Character network extracted from "Pride and Prejudice"](./docs/pp_white_bg.svg)
+ Renard (Relationship Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
+
+ ![The Renard logo](./docs/renard.svg)


  # Installation
@@ -63,3 +65,25 @@ Expensive tests are disabled by default. These can be run by setting the environ
  # Contributing

  see [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
+
+
+ # How to cite
+
+ If you use Renard in your research project, please cite it as follows:
+
+ ```bibtex
+ @Article{Amalvy2024,
+ doi = {10.21105/joss.06574},
+ year = {2024},
+ publisher = {The Open Journal},
+ volume = {9},
+ number = {98},
+ pages = {6574},
+ author = {Amalvy, A. and Labatut, V. and Dufour, R.},
+ title = {Renard: A Modular Pipeline for Extracting Character
+ Networks from Narrative Texts},
+ journal = {Journal of Open Source Software},
+ }
+ ```
+
+ We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
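
For orientation, the pipeline the README describes is assembled from the step modules touched in this diff (tokenization, NER, character unification, graph extraction). Below is a minimal sketch of such a pipeline; `GraphRulesCharacterUnifier` and `min_appearances` appear in this diff, while the other class names, the `Pipeline` entry point, and all parameter values are assumptions based on the project's documentation, not taken from this release:

```python
# Sketch only: class names outside this diff and parameter values are
# illustrative assumptions, not verified against version 0.6.0.
from renard.pipeline import Pipeline
from renard.pipeline.tokenization import NLTKTokenizer
from renard.pipeline.ner import NLTKNamedEntityRecognizer
from renard.pipeline.character_unification import GraphRulesCharacterUnifier
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor

with open("./my_novel.txt") as f:
    text = f.read()

pipeline = Pipeline(
    [
        NLTKTokenizer(),
        NLTKNamedEntityRecognizer(),
        GraphRulesCharacterUnifier(min_appearances=10),
        CoOccurrencesGraphExtractor(co_occurrences_dist=25),
    ]
)

out = pipeline(text)
out.export_graph_to_gexf("./network.gexf")
```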
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/pyproject.toml

@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "renard-pipeline"
- version = "0.4.1"
+ version = "0.6.0"
  description = "Relationships Extraction from NARrative Documents"
  authors = ["Arthur Amalvy <arthur.amalvy@univ-avignon.fr>"]
  license = "GPL-3.0-only"
@@ -14,29 +14,29 @@ documentation = "https://compnet.github.io/Renard/"

  [tool.poetry.dependencies]
  # optional dependencies
- stanza = { version = "^1.3.0", optional = true }
- spacy = { version = "^3.5.0", optional = true }
- coreferee = { version = "^1.4.0", optional = true }
- spacy-transformers = {version = "^1.2.1", optional = true}
+ stanza = { version = "^1.3", optional = true }
+ spacy = { version = "^3.5", optional = true }
+ coreferee = { version = "^1.4", optional = true }
+ spacy-transformers = {version = "^1.3", optional = true}
  # required dependencies
- python = "^3.8,<3.11"
+ python = "^3.8,<3.12"
  torch = ">=2.0.0, !=2.0.1"
- transformers = "^4.36.0"
- nltk = "^3.6.5"
+ transformers = "^4.36"
+ nltk = "^3.9"
  tqdm = "^4.62.3"
- networkx = "^2.6.3"
- more-itertools = "^10.1.0"
- nameparser = "^1.1.0"
- matplotlib = "^3.5.3"
- seqeval = "1.2.2"
- pandas = "^2.0.0"
- pytest = "^7.2.1"
- tibert = "^0.3.0"
- grimbert = "^0.1.0"
- datasets = "^2.16.1"
+ networkx = "^3.0"
+ more-itertools = "^10.5"
+ nameparser = "^1.1"
+ matplotlib = "^3.5"
+ pandas = "^2.0"
+ pytest = "^8.3.0"
+ tibert = "^0.5"
+ grimbert = "^0.1"
+ datasets = "^3.0"
+ rank-bm25 = "^0.2.2"

  [tool.poetry.dev-dependencies]
- hypothesis = "^6.24.0"
+ hypothesis = "^6.82"
  Sphinx = "^4.3.1"
  sphinx-rtd-theme = "^1.0.0"
  sphinx-autodoc-typehints = "^1.12.0"
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/graph_utils.py

@@ -70,10 +70,17 @@ def graph_with_names(
  else:
  name_style_fn = name_style

- return nx.relabel_nodes(
- G,
- {character: name_style_fn(character) for character in G.nodes()}, # type: ignore
- )
+ mapping = {}
+ for character in G.nodes():
+ # NOTE: it is *possible* to have a graph where nodes are not
+ # characters (for example, simple strings). Therefore, we are
+ # lenient here
+ try:
+ mapping[character] = name_style_fn(character)
+ except AttributeError:
+ mapping[character] = character
+
+ return nx.relabel_nodes(G, mapping)


  def layout_with_names(
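
The hunk above replaces the one-shot dict comprehension with a lenient loop, so graphs whose nodes are not Character objects (plain strings, for instance) no longer raise on relabeling. A standalone illustration of the same pattern with toy data; `shorten` is a made-up stand-in for the real `name_style_fn` and assumes Character-like nodes expose a `longest_name()` method:

```python
# Toy illustration of the lenient relabeling above; `shorten` stands in for
# name_style_fn and assumes Character-like nodes have a longest_name() method.
import networkx as nx


def shorten(character):
    return character.longest_name().split()[0]


G = nx.Graph()
G.add_edge("Elizabeth Bennet", "Mr. Darcy")  # plain-string nodes, no Character

mapping = {}
for node in G.nodes():
    try:
        mapping[node] = shorten(node)
    except AttributeError:
        # not a Character-like object: keep the label unchanged
        mapping[node] = node

G = nx.relabel_nodes(G, mapping)
print(list(G.nodes()))  # ['Elizabeth Bennet', 'Mr. Darcy']
```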
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/ner_utils.py

@@ -74,7 +74,7 @@ class DataCollatorForTokenClassificationWithBatchEncoding:
  class NERDataset(Dataset):
  """
  :ivar _context_mask: for each element, a mask indicating which
- tokens are part of the context (1 for context, 0 for text on
+ tokens are part of the context (0 for context, 1 for text on
  which to perform inference). The mask allows to discard
  predictions made for context at inference time, even though
  the context can still be passed as input to the model.
@@ -92,11 +92,11 @@ class NERDataset(Dataset):
  assert all(
  [len(cm) == len(elt) for elt, cm in zip(self.elements, context_mask)]
  )
- self._context_mask = context_mask or [[0] * len(elt) for elt in self.elements]
+ self._context_mask = context_mask or [[1] * len(elt) for elt in self.elements]

  self.tokenizer = tokenizer

- def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
+ def __getitem__(self, index: int) -> BatchEncoding:
  element = self.elements[index]

  batch = self.tokenizer(
@@ -104,15 +104,18 @@ class NERDataset(Dataset):
  truncation=True,
  max_length=512, # TODO
  is_split_into_words=True,
+ return_length=True,
  )

- batch["context_mask"] = [0] * len(batch["input_ids"])
- elt_context_mask = self._context_mask[index]
- for i in range(len(element)):
- w2t = batch.word_to_tokens(0, i)
- mask_value = elt_context_mask[i]
- tokens_mask = [mask_value] * (w2t.end - w2t.start)
- batch["context_mask"][w2t.start : w2t.end] = tokens_mask
+ length = batch["length"][0]
+ del batch["length"]
+ if self.tokenizer.truncation_side == "right":
+ batch["context_mask"] = self._context_mask[index][:length]
+ else:
+ assert self.tokenizer.truncation_side == "left"
+ batch["context_mask"] = self._context_mask[index][
+ len(batch["input_ids"]) - length :
+ ]

  return batch
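
The rewritten `__getitem__` above asks the tokenizer to report the post-truncation `length` (via `return_length=True`) and then trims the stored context mask to match: cut from the end for right-side truncation, from the start for left-side truncation. A simplified, self-contained sketch of that slicing, using plain lists instead of a real `BatchEncoding`; `trim_context_mask` is a hypothetical helper name:

```python
# Illustrative only: mirrors the truncation-side handling added to
# NERDataset.__getitem__ above, with plain lists standing in for the
# tokenizer output.
from typing import List


def trim_context_mask(
    context_mask: List[int], length: int, truncation_side: str = "right"
) -> List[int]:
    """Keep only the part of the mask that survives truncation."""
    if truncation_side == "right":
        # tokens are dropped from the end: keep the first `length` entries
        return context_mask[:length]
    assert truncation_side == "left"
    # tokens are dropped from the start: keep the last `length` entries
    return context_mask[len(context_mask) - length :]


# 0 marks context tokens, 1 marks the text to actually tag
# (see the docstring fix in the first hunk of this file)
mask = [0, 0, 0, 1, 1, 1, 1]
print(trim_context_mask(mask, 5, "right"))  # [0, 0, 0, 1, 1]
print(trim_context_mask(mask, 5, "left"))   # [0, 1, 1, 1, 1]
```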
 
@@ -181,6 +184,7 @@ def load_conll2002_bio(
  path: str,
  tag_conversion_map: Optional[Dict[str, str]] = None,
  separator: str = "\t",
+ max_sent_len: Optional[int] = None,
  **kwargs,
  ) -> Tuple[List[List[str]], List[str], List[NEREntity]]:
  """Load a file under CoNLL2022 BIO format. Sentences are expected
@@ -192,7 +196,9 @@
  :param separator: separator between token and BIO tags
  :param tag_conversion_map: conversion map for tags found in the
  input file. Example : ``{'B': 'B-PER', 'I': 'I-PER'}``
- :param kwargs: additional kwargs for ``open`` (such as
+ :param max_sent_len: if specified, maximum length, in tokens, of
+ sentences.
+ :param kwargs: additional kwargs for :func:`open` (such as
  ``encoding`` or ``newline``).

  :return: ``(sentences, tokens, entities)``
@@ -207,7 +213,9 @@
  tags = []
  for line in raw_data.split("\n"):
  line = line.strip("\n")
- if re.fullmatch(r"\s*", line):
+ if re.fullmatch(r"\s*", line) or (
+ not max_sent_len is None and len(sent_tokens) >= max_sent_len
+ ):
  if len(sent_tokens) == 0:
  continue
  sents.append(sent_tokens)
@@ -227,6 +235,7 @@ def hgdataset_from_conll2002(
  path: str,
  tag_conversion_map: Optional[Dict[str, str]] = None,
  separator: str = "\t",
+ max_sent_len: Optional[int] = None,
  **kwargs,
  ) -> HGDataset:
  """Load a CoNLL-2002 file as a Huggingface Dataset.
@@ -234,12 +243,13 @@
  :param path: passed to :func:`.load_conll2002_bio`
  :param tag_conversion_map: passed to :func:`load_conll2002_bio`
  :param separator: passed to :func:`load_conll2002_bio`
- :param kwargs: passed to :func:`load_conll2002_bio`
+ :param max_sent_len: passed to :func:`load_conll2002_bio`
+ :param kwargs: additional kwargs for :func:`open`

  :return: a :class:`datasets.Dataset` with features 'tokens' and 'labels'.
  """
  sentences, tokens, entities = load_conll2002_bio(
- path, tag_conversion_map, separator, **kwargs
+ path, tag_conversion_map, separator, max_sent_len, **kwargs
  )

  # convert entities to labels
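
Based only on the signatures in the hunks above, the new `max_sent_len` argument would be used roughly as follows; the file path and parameter values are placeholders for illustration:

```python
# Hypothetical usage of the new max_sent_len parameter; only the signature
# comes from the hunks above, the path and values are placeholders.
from renard.ner_utils import load_conll2002_bio

sentences, tokens, entities = load_conll2002_bio(
    "./my_corpus.conll",
    tag_conversion_map={"B": "B-PER", "I": "I-PER"},
    separator="\t",
    max_sent_len=64,    # force a sentence break after 64 tokens
    encoding="utf-8",   # forwarded to open()
)
print(len(sentences), len(tokens), len(entities))
```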
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/character_unification.py

@@ -1,5 +1,5 @@
  from typing import Any, Dict, List, FrozenSet, Set, Optional, Tuple, Union, Literal
- import copy
+ import re, sys
  from itertools import combinations
  from collections import defaultdict, Counter
  from dataclasses import dataclass
@@ -11,6 +11,7 @@ from renard.pipeline.ner import NEREntity
  from renard.pipeline.progress import ProgressReporter
  from renard.resources.hypocorisms import HypocorismGazetteer
  from renard.resources.pronouns import is_a_female_pronoun, is_a_male_pronoun
+ from renard.resources.determiners import singular_determiners
  from renard.resources.titles import is_a_male_title, is_a_female_title, all_titles


@@ -61,6 +62,8 @@ def _assign_coreference_mentions(
  # we assign each chain to the character with highest name
  # occurence in it
  for chain in corefs:
+ if len(char_mentions) == 0:
+ break
  # determine the characters with the highest number of
  # occurences
  occ_counter = {}
@@ -98,8 +101,13 @@ class NaiveCharacterUnifier(PipelineStep):
  character for it to be valid
  """
  self.min_appearances = min_appearances
+ # a default value, will be est by _pipeline_init_
+ self.character_ner_tag = "PER"
  super().__init__()

+ def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
+ self.character_ner_tag = character_ner_tag
+
  def __call__(
  self,
  text: str,
@@ -112,7 +120,7 @@ class NaiveCharacterUnifier(PipelineStep):
  :param tokens:
  :param entities:
  """
- persons = [e for e in entities if e.tag == "PER"]
+ persons = [e for e in entities if e.tag == self.character_ner_tag]

  characters = defaultdict(list)
  for entity in persons:
@@ -159,6 +167,8 @@ class GraphRulesCharacterUnifier(PipelineStep):
  min_appearances: int = 0,
  additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
  link_corefs_mentions: bool = False,
+ ignore_lone_titles: Optional[Set[str]] = None,
+ ignore_leading_determiner: bool = False,
  ) -> None:
  """
  :param min_appearances: minimum number of appearances of a
@@ -173,20 +183,32 @@ class GraphRulesCharacterUnifier(PipelineStep):
  extract a lot of spurious links. However, linking by
  coref is sometimes the only way to resolve a character
  alias.
+ :param ignore_lone_titles: a set of titles to ignore when they
+ stand on their own. This avoids extracting false
+ positives characters such as 'Mr.' or 'Miss'.
+ :param ignore_leading_determiner: if ``True``, will ignore the
+ leading determiner when applying unification rules. This
+ is useful if the NER model used in the pipeline adds
+ leading determiners as part of entites.
  """
  self.min_appearances = min_appearances
  self.additional_hypocorisms = additional_hypocorisms
  self.link_corefs_mentions = link_corefs_mentions
+ self.ignore_lone_titles = ignore_lone_titles or set()
+ self.character_ner_tag = "PER" # a default value, will be set by _pipeline_init
+ self.ignore_leading_determiner = ignore_leading_determiner

  super().__init__()

- def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+ def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
  self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
  if not self.additional_hypocorisms is None:
  for name, nicknames in self.additional_hypocorisms:
  self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)

- return super()._pipeline_init_(lang, progress_reporter)
+ self.character_ner_tag = character_ner_tag
+
+ return super()._pipeline_init_(lang, **kwargs)

  def __call__(
  self,
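
A hedged sketch of how the two new constructor arguments documented above might be used together; only the argument names come from this diff, the title set and values are illustrative:

```python
# Sketch only: argument names are taken from the constructor hunk above,
# the actual values are illustrative.
from renard.pipeline.character_unification import GraphRulesCharacterUnifier

unifier = GraphRulesCharacterUnifier(
    min_appearances=5,                           # drop rarely-mentioned characters
    ignore_lone_titles={"Mr.", "Mrs.", "Miss"},  # skip bare titles
    ignore_leading_determiner=True,              # e.g. strip a leading "the"/"le"
)
```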
@@ -196,12 +218,17 @@ class GraphRulesCharacterUnifier(PipelineStep):
  ) -> Dict[str, Any]:
  import networkx as nx

- mentions = [m for m in entities if m.tag == "PER"]
- mentions_str = [" ".join(m.tokens) for m in mentions]
+ mentions = [m for m in entities if m.tag == self.character_ner_tag]
+ mentions_str = set(
+ filter(
+ lambda m: not m in self.ignore_lone_titles,
+ map(lambda m: " ".join(m.tokens), mentions),
+ )
+ )

  # * create a graph where each node is a mention detected by NER
  G = nx.Graph()
- for mention_str in set(mentions_str):
+ for mention_str in mentions_str:
  G.add_node(mention_str)

  # * HumanName local configuration - dependant on language
@@ -209,23 +236,28 @@

  # * link nodes based on several rules
  for name1, name2 in combinations(G.nodes(), 2):
+
+ # preprocess name when needed
+ pname1 = self._preprocess_name(name1)
+ pname2 = self._preprocess_name(name2)
+
  # is one name a known hypocorism of the other ? (also
  # checks if both names are the same)
- if self.hypocorism_gazetteer.are_related(name1, name2):
+ if self.hypocorism_gazetteer.are_related(pname1, pname2):
  G.add_edge(name1, name2)
  continue

  # if we remove the title, is one name related to the other
  # ?
  if self.names_are_related_after_title_removal(
- name1, name2, hname_constants
+ pname1, pname2, hname_constants
  ):
  G.add_edge(name1, name2)
  continue

  # add an edge if two characters have the same family names
- human_name1 = HumanName(name1, constants=hname_constants)
- human_name2 = HumanName(name2, constants=hname_constants)
+ human_name1 = HumanName(pname1, constants=hname_constants)
+ human_name2 = HumanName(pname2, constants=hname_constants)
  if (
  len(human_name1.last) > 0
  and human_name1.last.lower() == human_name2.last.lower()
@@ -262,10 +294,15 @@
  pass

  for name1, name2 in combinations(G.nodes(), 2):
+
+ # preprocess names when needed
+ pname1 = self._preprocess_name(name1)
+ pname2 = self._preprocess_name(name2)
+
  # check if characters have the same last name but a
  # different first name.
- human_name1 = HumanName(name1, constants=hname_constants)
- human_name2 = HumanName(name2, constants=hname_constants)
+ human_name1 = HumanName(pname1, constants=hname_constants)
+ human_name2 = HumanName(pname2, constants=hname_constants)
  if (
  len(human_name1.last) > 0
  and len(human_name2.last) > 0
@@ -317,6 +354,17 @@

  return {"characters": characters}

+ def _preprocess_name(self, name) -> str:
+ if self.ignore_leading_determiner:
+ if not self.lang in singular_determiners:
+ print(
+ f"[warning] can't ignore leading determiners for {self.lang}",
+ file=sys.stderr,
+ )
+ for determiner in singular_determiners.get(self.lang, []):
+ name = re.sub(f"^{determiner} ", " ", name, flags=re.I)
+ return name
+
  def _make_hname_constants(self) -> Constants:
  if self.lang == "eng":
  return Constants()
@@ -345,13 +393,18 @@
  or self.hypocorism_gazetteer.are_related(raw_name1, raw_name2)
  )

- def names_are_in_coref(self, name1: str, name2: str, corefs: List[List[Mention]]):
+ def names_are_in_coref(
+ self, name1: str, name2: str, corefs: List[List[Mention]]
+ ) -> bool:
+ once_together = False
  for coref_chain in corefs:
- if any([name1 == " ".join(m.tokens) for m in coref_chain]) and any(
- [name2 == " ".join(m.tokens) for m in coref_chain]
- ):
- return True
- return False
+ name1_in = any([name1 == " ".join(m.tokens) for m in coref_chain])
+ name2_in = any([name2 == " ".join(m.tokens) for m in coref_chain])
+ if name1_in == (not name2_in):
+ return False
+ elif name1_in and name2_in:
+ once_together = True
+ return once_together

  def infer_name_gender(
  self,
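
The rewritten `names_are_in_coref` above only links two names if no chain contains one of them without the other, and at least one chain contains both. A toy illustration of that stricter rule with plain string chains (the helper name is hypothetical, and real chains hold `Mention` objects, not strings):

```python
# Toy version of the stricter coreference rule above: the two names must
# never appear apart, and must appear together at least once.
from typing import List


def names_always_corefer(name1: str, name2: str, chains: List[List[str]]) -> bool:
    once_together = False
    for chain in chains:
        in1, in2 = name1 in chain, name2 in chain
        if in1 != in2:  # one name appears without the other
            return False
        if in1 and in2:
            once_together = True
    return once_together


chains = [["Elizabeth", "Lizzy", "she"], ["Lizzy", "Elizabeth"], ["Jane", "she"]]
print(names_always_corefer("Elizabeth", "Lizzy", chains))  # True
print(names_always_corefer("Elizabeth", "Jane", chains))   # False
```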
{renard_pipeline-0.4.1 → renard_pipeline-0.6.0}/renard/pipeline/characters_extraction.py

@@ -1,7 +1,9 @@
+ import sys
  import renard.pipeline.character_unification as cu

  print(
- "[warning] the characters_extraction module is deprecated. Use character_unification instead."
+ "[warning] the characters_extraction module is deprecated. Use character_unification instead.",
+ file=sys.stderr,
  )

  Character = cu.Character
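
Since `characters_extraction` now only re-exports from `character_unification` and routes its deprecation warning to stderr, downstream code would typically migrate its imports, roughly:

```python
# Old (still works, but prints a deprecation warning to stderr):
# from renard.pipeline.characters_extraction import Character
# New:
from renard.pipeline.character_unification import Character
```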