renard-pipeline 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of renard-pipeline might be problematic. Click here for more details.
- renard/pipeline/character_unification.py +12 -2
- renard/pipeline/core.py +7 -0
- {renard_pipeline-0.4.0.dist-info → renard_pipeline-0.4.2.dist-info}/METADATA +32 -2
- {renard_pipeline-0.4.0.dist-info → renard_pipeline-0.4.2.dist-info}/RECORD +6 -6
- {renard_pipeline-0.4.0.dist-info → renard_pipeline-0.4.2.dist-info}/WHEEL +1 -1
- {renard_pipeline-0.4.0.dist-info → renard_pipeline-0.4.2.dist-info}/LICENSE +0 -0
|
@@ -159,6 +159,7 @@ class GraphRulesCharacterUnifier(PipelineStep):
|
|
|
159
159
|
min_appearances: int = 0,
|
|
160
160
|
additional_hypocorisms: Optional[List[Tuple[str, List[str]]]] = None,
|
|
161
161
|
link_corefs_mentions: bool = False,
|
|
162
|
+
ignore_lone_titles: Optional[Set[str]] = None,
|
|
162
163
|
) -> None:
|
|
163
164
|
"""
|
|
164
165
|
:param min_appearances: minimum number of appearances of a
|
|
@@ -173,10 +174,14 @@ class GraphRulesCharacterUnifier(PipelineStep):
|
|
|
173
174
|
extract a lot of spurious links. However, linking by
|
|
174
175
|
coref is sometimes the only way to resolve a character
|
|
175
176
|
alias.
|
|
177
|
+
:param ignore_lone_titles: a set of titles to ignore when
|
|
178
|
+
they stand on their own. This avoids extracting false
|
|
179
|
+
positive characters such as 'Mr.' or 'Miss'.
|
|
176
180
|
"""
|
|
177
181
|
self.min_appearances = min_appearances
|
|
178
182
|
self.additional_hypocorisms = additional_hypocorisms
|
|
179
183
|
self.link_corefs_mentions = link_corefs_mentions
|
|
184
|
+
self.ignore_lone_titles = ignore_lone_titles or set()
|
|
180
185
|
|
|
181
186
|
super().__init__()
|
|
182
187
|
|
|
@@ -197,11 +202,16 @@ class GraphRulesCharacterUnifier(PipelineStep):
|
|
|
197
202
|
import networkx as nx
|
|
198
203
|
|
|
199
204
|
mentions = [m for m in entities if m.tag == "PER"]
|
|
200
|
-
mentions_str =
|
|
205
|
+
mentions_str = set(
|
|
206
|
+
filter(
|
|
207
|
+
lambda m: not m in self.ignore_lone_titles,
|
|
208
|
+
map(lambda m: " ".join(m.tokens), mentions),
|
|
209
|
+
)
|
|
210
|
+
)
|
|
201
211
|
|
|
202
212
|
# * create a graph where each node is a mention detected by NER
|
|
203
213
|
G = nx.Graph()
|
|
204
|
-
for mention_str in
|
|
214
|
+
for mention_str in mentions_str:
|
|
205
215
|
G.add_node(mention_str)
|
|
206
216
|
|
|
207
217
|
# * HumanName local configuration - dependent on language
|
renard/pipeline/core.py
CHANGED
|
@@ -50,6 +50,13 @@ class Mention:
|
|
|
50
50
|
self_dict["end_idx"] = self.end_idx + shift
|
|
51
51
|
return self.__class__(**self_dict)
|
|
52
52
|
|
|
53
|
+
def __eq__(self, other: Mention) -> bool:
|
|
54
|
+
return (
|
|
55
|
+
self.tokens == other.tokens
|
|
56
|
+
and self.start_idx == other.start_idx
|
|
57
|
+
and self.end_idx == other.end_idx
|
|
58
|
+
)
|
|
59
|
+
|
|
53
60
|
def __hash__(self) -> int:
|
|
54
61
|
return hash(tuple(self.tokens) + (self.start_idx, self.end_idx))
|
|
55
62
|
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: renard-pipeline
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Relationships Extraction from NARrative Documents
|
|
5
|
+
Home-page: https://github.com/CompNet/Renard
|
|
5
6
|
License: GPL-3.0-only
|
|
6
7
|
Author: Arthur Amalvy
|
|
7
8
|
Author-email: arthur.amalvy@univ-avignon.fr
|
|
@@ -31,6 +32,8 @@ Requires-Dist: tibert (>=0.3.0,<0.4.0)
|
|
|
31
32
|
Requires-Dist: torch (>=2.0.0,!=2.0.1)
|
|
32
33
|
Requires-Dist: tqdm (>=4.62.3,<5.0.0)
|
|
33
34
|
Requires-Dist: transformers (>=4.36.0,<5.0.0)
|
|
35
|
+
Project-URL: Documentation, https://compnet.github.io/Renard/
|
|
36
|
+
Project-URL: Repository, https://github.com/CompNet/Renard
|
|
34
37
|
Description-Content-Type: text/markdown
|
|
35
38
|
|
|
36
39
|
# Renard
|
|
@@ -46,6 +49,8 @@ You can install the latest version using pip:
|
|
|
46
49
|
|
|
47
50
|
> pip install renard-pipeline
|
|
48
51
|
|
|
52
|
+
Currently, Renard supports Python 3.8, 3.9 and 3.10.
|
|
53
|
+
|
|
49
54
|
|
|
50
55
|
# Documentation
|
|
51
56
|
|
|
@@ -56,7 +61,32 @@ If you need local documentation, it can be generated using `Sphinx`. From the `d
|
|
|
56
61
|
|
|
57
62
|
# Tutorial
|
|
58
63
|
|
|
59
|
-
|
|
64
|
+
Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from renard.pipeline import Pipeline
|
|
68
|
+
from renard.pipeline.tokenization import NLTKTokenizer
|
|
69
|
+
from renard.pipeline.ner import NLTKNamedEntityRecognizer
|
|
70
|
+
from renard.pipeline.character_unification import GraphRulesCharacterUnifier
|
|
71
|
+
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
|
|
72
|
+
|
|
73
|
+
with open("./my_doc.txt") as f:
|
|
74
|
+
text = f.read()
|
|
75
|
+
|
|
76
|
+
pipeline = Pipeline(
|
|
77
|
+
[
|
|
78
|
+
NLTKTokenizer(),
|
|
79
|
+
NLTKNamedEntityRecognizer(),
|
|
80
|
+
GraphRulesCharacterUnifier(min_appearances=10),
|
|
81
|
+
CoOccurrencesGraphExtractor(co_occurrences_dist=25)
|
|
82
|
+
]
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
out = pipeline(text)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard_tutorial.py`).
|
|
89
|
+
|
|
60
90
|
|
|
61
91
|
|
|
62
92
|
# Running tests
|
|
@@ -3,9 +3,9 @@ renard/graph_utils.py,sha256=5jwky9JgJ-WMVHfeaiXkAAQwEfhR2BFSrWhck1Qmpgo,5812
|
|
|
3
3
|
renard/ner_utils.py,sha256=jN1AQkaV0Kx-Bc0oc3SYBEmSUuKPBbzXqByOlaqH62k,11263
|
|
4
4
|
renard/nltk_utils.py,sha256=mUJiwMrEDZV4Fla7WuMR-hA_OC2ZIwSXgW_0Ew18VSo,977
|
|
5
5
|
renard/pipeline/__init__.py,sha256=8Yim2mmny8YGvM7N5-na5zK-C9UDxUb77K9ml-VirUA,35
|
|
6
|
-
renard/pipeline/character_unification.py,sha256=
|
|
6
|
+
renard/pipeline/character_unification.py,sha256=GJvPKw2zSMi0RpLLVlKsu7ewpxkrdxytND9PLxolbP4,15252
|
|
7
7
|
renard/pipeline/characters_extraction.py,sha256=NzF8H9X19diW6rqwS5ERrRku7rFueO3S077H5C6kb7I,363
|
|
8
|
-
renard/pipeline/core.py,sha256=
|
|
8
|
+
renard/pipeline/core.py,sha256=luKNUTCDtZfwKzxVIaImyIMwFFvIknfT1LdQtongj24,22570
|
|
9
9
|
renard/pipeline/corefs/__init__.py,sha256=9c9AaXBcRrDBf1jhTtJ7DyjOJhX_Zej3FjlcGak7MK8,44
|
|
10
10
|
renard/pipeline/corefs/corefs.py,sha256=nzYT6S9ify3FlgGB3FSDpAhs2UQYgW9c3CL2GRYzTms,11508
|
|
11
11
|
renard/pipeline/graph_extraction.py,sha256=n0T_nzNGiwE9bDubpPknHe7bbDhJ4ndnqmoMmyfbeWg,19468
|
|
@@ -29,7 +29,7 @@ renard/resources/pronouns/pronouns.py,sha256=YJ8hM6H8QHrF2Xx6O5blqc-Sqe1D1YFL0sR
|
|
|
29
29
|
renard/resources/titles/__init__.py,sha256=Jcg4B7stsWiAaXbFgNl_L3ICtCQmFe9bo3YjdkVL50w,45
|
|
30
30
|
renard/resources/titles/titles.py,sha256=GsFccVJuTkgDWiAqWZpFd2R9pGvFKQZBOk4RWWuWDkw,968
|
|
31
31
|
renard/utils.py,sha256=8J3swFqSi4YqhgYNXvttJ0s-DmJbl_yEYri6JpGEWH8,2340
|
|
32
|
-
renard_pipeline-0.4.
|
|
33
|
-
renard_pipeline-0.4.
|
|
34
|
-
renard_pipeline-0.4.
|
|
35
|
-
renard_pipeline-0.4.
|
|
32
|
+
renard_pipeline-0.4.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
renard_pipeline-0.4.2.dist-info/METADATA,sha256=R1ZbG6Mdk1B5Zk73QSKB-lZu7rDnvWKe3M5JiDqPFxM,3697
|
|
34
|
+
renard_pipeline-0.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
35
|
+
renard_pipeline-0.4.2.dist-info/RECORD,,
|
|
File without changes
|