renard-pipeline 0.6.3__tar.gz → 0.6.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of renard-pipeline might be problematic. Click here for more details.
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/PKG-INFO +23 -29
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/README.md +2 -2
- renard_pipeline-0.6.5/pyproject.toml +39 -0
- renard_pipeline-0.6.5/renard_pipeline.egg-info/PKG-INFO +117 -0
- renard_pipeline-0.6.5/renard_pipeline.egg-info/SOURCES.txt +52 -0
- renard_pipeline-0.6.5/renard_pipeline.egg-info/dependency_links.txt +1 -0
- renard_pipeline-0.6.5/renard_pipeline.egg-info/requires.txt +14 -0
- renard_pipeline-0.6.5/renard_pipeline.egg-info/top_level.txt +1 -0
- renard_pipeline-0.6.5/setup.cfg +4 -0
- renard_pipeline-0.6.5/tests/test_character_unification.py +16 -0
- renard_pipeline-0.6.5/tests/test_corefs.py +15 -0
- renard_pipeline-0.6.5/tests/test_graph_extraction.py +121 -0
- renard_pipeline-0.6.5/tests/test_graph_utils.py +14 -0
- renard_pipeline-0.6.5/tests/test_ner.py +79 -0
- renard_pipeline-0.6.5/tests/test_pipeline.py +92 -0
- renard_pipeline-0.6.5/tests/test_quote_detection.py +38 -0
- renard_pipeline-0.6.5/tests/test_sentiment_analysis.py +14 -0
- renard_pipeline-0.6.5/tests/test_stanza.py +20 -0
- renard_pipeline-0.6.5/tests/test_tokenization.py +26 -0
- renard_pipeline-0.6.5/tests/test_utils.py +15 -0
- renard_pipeline-0.6.3/pyproject.toml +0 -40
- renard_pipeline-0.6.3/renard/resources/hypocorisms/datas/License.txt +0 -201
- renard_pipeline-0.6.3/renard/resources/hypocorisms/datas/hypocorisms.csv +0 -1084
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/LICENSE +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/gender.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/graph_utils.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/ner_utils.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/nltk_utils.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/character_unification.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/characters_extraction.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/core.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/corefs/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/corefs/corefs.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/graph_extraction.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/ner/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/ner/ner.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/ner/retrieval.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/preconfigured.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/preprocessing.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/progress.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/quote_detection.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/sentiment_analysis.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/speaker_attribution.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/stanford_corenlp.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/pipeline/tokenization.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/plot_utils.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/py.typed +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/determiners/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/determiners/determiners.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/hypocorisms/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/hypocorisms/hypocorisms.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/pronouns/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/pronouns/pronouns.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/titles/__init__.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/resources/titles/titles.py +0 -0
- {renard_pipeline-0.6.3 → renard_pipeline-0.6.5}/renard/utils.py +0 -0
|
@@ -1,35 +1,30 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: renard-pipeline
|
|
3
|
-
Version: 0.6.3
|
|
3
|
+
Version: 0.6.5
|
|
4
4
|
Summary: Relationships Extraction from NARrative Documents
|
|
5
|
-
|
|
5
|
+
Author-email: Arthur Amalvy <arthur.amalvy@univ-avignon.fr>
|
|
6
6
|
License: GPL-3.0-only
|
|
7
|
-
|
|
8
|
-
Author-email: arthur.amalvy@univ-avignon.fr
|
|
9
|
-
Requires-Python: >=3.8,<3.12
|
|
10
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Requires-Dist: datasets (>=3.0,<4.0)
|
|
17
|
-
Requires-Dist: grimbert (>=0.1,<0.2)
|
|
18
|
-
Requires-Dist: matplotlib (>=3.5,<4.0)
|
|
19
|
-
Requires-Dist: more-itertools (>=10.5,<11.0)
|
|
20
|
-
Requires-Dist: nameparser (>=1.1,<2.0)
|
|
21
|
-
Requires-Dist: networkx (>=3.0,<4.0)
|
|
22
|
-
Requires-Dist: nltk (>=3.9,<4.0)
|
|
23
|
-
Requires-Dist: pandas (>=2.0,<3.0)
|
|
24
|
-
Requires-Dist: pytest (>=8.3.0,<9.0.0)
|
|
25
|
-
Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
|
|
26
|
-
Requires-Dist: tibert (>=0.5,<0.6)
|
|
27
|
-
Requires-Dist: torch (>=2.0.0,!=2.0.1)
|
|
28
|
-
Requires-Dist: tqdm (>=4.62.3,<5.0.0)
|
|
29
|
-
Requires-Dist: transformers (>=4.37,<5.0)
|
|
7
|
+
Project-URL: Homepage, https://github.com/CompNet/Renard
|
|
30
8
|
Project-URL: Documentation, https://compnet.github.io/Renard/
|
|
31
9
|
Project-URL: Repository, https://github.com/CompNet/Renard
|
|
10
|
+
Requires-Python: <3.13,>=3.9
|
|
32
11
|
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: torch!=2.0.1,>=2.0.0
|
|
14
|
+
Requires-Dist: transformers>=4.37
|
|
15
|
+
Requires-Dist: nltk>=3.9
|
|
16
|
+
Requires-Dist: tqdm>=4.62
|
|
17
|
+
Requires-Dist: networkx>=3.0
|
|
18
|
+
Requires-Dist: more-itertools>=10.5
|
|
19
|
+
Requires-Dist: nameparser>=1.1
|
|
20
|
+
Requires-Dist: matplotlib>=3.5
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: pytest>=8.3.0
|
|
23
|
+
Requires-Dist: tibert>=0.5
|
|
24
|
+
Requires-Dist: grimbert>=0.1
|
|
25
|
+
Requires-Dist: datasets>=3.0
|
|
26
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
27
|
+
Dynamic: license-file
|
|
33
28
|
|
|
34
29
|
# Renard
|
|
35
30
|
|
|
@@ -46,7 +41,7 @@ You can install the latest version using pip:
|
|
|
46
41
|
|
|
47
42
|
> pip install renard-pipeline
|
|
48
43
|
|
|
49
|
-
Currently, Renard supports Python>=3.
|
|
44
|
+
Currently, Renard supports Python >=3.9,<3.13 (i.e. Python 3.9 through 3.12).
|
|
50
45
|
|
|
51
46
|
|
|
52
47
|
# Documentation
|
|
@@ -90,7 +85,7 @@ For more information, see `renard_tutorial.py`, which is a tutorial in the `jupy
|
|
|
90
85
|
|
|
91
86
|
`Renard` uses `pytest` for testing. To launch tests, use the following command:
|
|
92
87
|
|
|
93
|
-
>
|
|
88
|
+
> uv run python -m pytest tests
|
|
94
89
|
|
|
95
90
|
Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
|
|
96
91
|
|
|
@@ -120,4 +115,3 @@ If you use Renard in your research project, please cite it as follows:
|
|
|
120
115
|
```
|
|
121
116
|
|
|
122
117
|
We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
|
|
123
|
-
|
|
@@ -13,7 +13,7 @@ You can install the latest version using pip:
|
|
|
13
13
|
|
|
14
14
|
> pip install renard-pipeline
|
|
15
15
|
|
|
16
|
-
Currently, Renard supports Python>=3.
|
|
16
|
+
Currently, Renard supports Python >=3.9,<3.13 (i.e. Python 3.9 through 3.12).
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
# Documentation
|
|
@@ -57,7 +57,7 @@ For more information, see `renard_tutorial.py`, which is a tutorial in the `jupy
|
|
|
57
57
|
|
|
58
58
|
`Renard` uses `pytest` for testing. To launch tests, use the following command:
|
|
59
59
|
|
|
60
|
-
>
|
|
60
|
+
> uv run python -m pytest tests
|
|
61
61
|
|
|
62
62
|
Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
|
|
63
63
|
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "renard-pipeline"
|
|
3
|
+
version = "0.6.5"
|
|
4
|
+
description = "Relationships Extraction from NARrative Documents"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Arthur Amalvy", email = "arthur.amalvy@univ-avignon.fr"},
|
|
7
|
+
]
|
|
8
|
+
license = { text = "GPL-3.0-only" }
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9,<3.13"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"torch>=2.0.0,!=2.0.1",
|
|
13
|
+
"transformers>=4.37",
|
|
14
|
+
"nltk>=3.9",
|
|
15
|
+
"tqdm>=4.62",
|
|
16
|
+
"networkx>=3.0",
|
|
17
|
+
"more-itertools>=10.5",
|
|
18
|
+
"nameparser>=1.1",
|
|
19
|
+
"matplotlib>=3.5",
|
|
20
|
+
"pandas>=2.0",
|
|
21
|
+
"pytest>=8.3.0",
|
|
22
|
+
"tibert>=0.5",
|
|
23
|
+
"grimbert>=0.1",
|
|
24
|
+
"datasets>=3.0",
|
|
25
|
+
"rank-bm25>=0.2.2",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/CompNet/Renard"
|
|
30
|
+
Documentation = "https://compnet.github.io/Renard/"
|
|
31
|
+
Repository = "https://github.com/CompNet/Renard"
|
|
32
|
+
|
|
33
|
+
[dependency-groups]
|
|
34
|
+
dev = [
|
|
35
|
+
"hypothesis>=6.82",
|
|
36
|
+
"Sphinx>=4.3",
|
|
37
|
+
"sphinx-rtd-theme>=1.0.0",
|
|
38
|
+
"sphinx-autodoc-typehints>=1.12.0",
|
|
39
|
+
]
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: renard-pipeline
|
|
3
|
+
Version: 0.6.5
|
|
4
|
+
Summary: Relationships Extraction from NARrative Documents
|
|
5
|
+
Author-email: Arthur Amalvy <arthur.amalvy@univ-avignon.fr>
|
|
6
|
+
License: GPL-3.0-only
|
|
7
|
+
Project-URL: Homepage, https://github.com/CompNet/Renard
|
|
8
|
+
Project-URL: Documentation, https://compnet.github.io/Renard/
|
|
9
|
+
Project-URL: Repository, https://github.com/CompNet/Renard
|
|
10
|
+
Requires-Python: <3.13,>=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: torch!=2.0.1,>=2.0.0
|
|
14
|
+
Requires-Dist: transformers>=4.37
|
|
15
|
+
Requires-Dist: nltk>=3.9
|
|
16
|
+
Requires-Dist: tqdm>=4.62
|
|
17
|
+
Requires-Dist: networkx>=3.0
|
|
18
|
+
Requires-Dist: more-itertools>=10.5
|
|
19
|
+
Requires-Dist: nameparser>=1.1
|
|
20
|
+
Requires-Dist: matplotlib>=3.5
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: pytest>=8.3.0
|
|
23
|
+
Requires-Dist: tibert>=0.5
|
|
24
|
+
Requires-Dist: grimbert>=0.1
|
|
25
|
+
Requires-Dist: datasets>=3.0
|
|
26
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# Renard
|
|
30
|
+
|
|
31
|
+
[DOI: 10.21105/joss.06574](https://doi.org/10.21105/joss.06574)
|
|
32
|
+
|
|
33
|
+
Renard (Relationship Extraction from NARrative Documents) is a library for creating and using custom character networks extraction pipelines. Renard can extract dynamic as well as static character networks.
|
|
34
|
+
|
|
35
|
+

|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Installation
|
|
39
|
+
|
|
40
|
+
You can install the latest version using pip:
|
|
41
|
+
|
|
42
|
+
> pip install renard-pipeline
|
|
43
|
+
|
|
44
|
+
Currently, Renard supports Python >=3.9,<3.13 (i.e. Python 3.9 through 3.12).
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Documentation
|
|
48
|
+
|
|
49
|
+
Documentation, including installation instructions, can be found at https://compnet.github.io/Renard/
|
|
50
|
+
|
|
51
|
+
If you need local documentation, it can be generated using `Sphinx`. From the `docs` directory, `make html` should create documentation under `docs/_build/html`.
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Tutorial
|
|
55
|
+
|
|
56
|
+
Renard's central concept is the `Pipeline`. A `Pipeline` is a list of `PipelineStep` that are run sequentially in order to extract a character graph from a document. Here is a simple example:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from renard.pipeline import Pipeline
|
|
60
|
+
from renard.pipeline.tokenization import NLTKTokenizer
|
|
61
|
+
from renard.pipeline.ner import NLTKNamedEntityRecognizer
|
|
62
|
+
from renard.pipeline.character_unification import GraphRulesCharacterUnifier
|
|
63
|
+
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
|
|
64
|
+
|
|
65
|
+
with open("./my_doc.txt") as f:
|
|
66
|
+
text = f.read()
|
|
67
|
+
|
|
68
|
+
pipeline = Pipeline(
|
|
69
|
+
[
|
|
70
|
+
NLTKTokenizer(),
|
|
71
|
+
NLTKNamedEntityRecognizer(),
|
|
72
|
+
GraphRulesCharacterUnifier(min_appearance=10),
|
|
73
|
+
CoOccurrencesGraphExtractor(co_occurrences_dist=25)
|
|
74
|
+
]
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
out = pipeline(text)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
For more information, see `renard_tutorial.py`, which is a tutorial in the `jupytext` format. You can open it as a notebook in Jupyter Notebook (or export it as a notebook with `jupytext --to ipynb renard-tutorial.py`).
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Running tests
|
|
85
|
+
|
|
86
|
+
`Renard` uses `pytest` for testing. To launch tests, use the following command:
|
|
87
|
+
|
|
88
|
+
> uv run python -m pytest tests
|
|
89
|
+
|
|
90
|
+
Expensive tests are disabled by default. These can be run by setting the environment variable `RENARD_TEST_ALL` to `1`.
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Contributing
|
|
94
|
+
|
|
95
|
+
See [the "Contributing" section of the documentation](https://compnet.github.io/Renard/contributing.html).
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# How to cite
|
|
99
|
+
|
|
100
|
+
If you use Renard in your research project, please cite it as follows:
|
|
101
|
+
|
|
102
|
+
```bibtex
|
|
103
|
+
@Article{Amalvy2024,
|
|
104
|
+
doi = {10.21105/joss.06574},
|
|
105
|
+
year = {2024},
|
|
106
|
+
publisher = {The Open Journal},
|
|
107
|
+
volume = {9},
|
|
108
|
+
number = {98},
|
|
109
|
+
pages = {6574},
|
|
110
|
+
author = {Amalvy, A. and Labatut, V. and Dufour, R.},
|
|
111
|
+
title = {Renard: A Modular Pipeline for Extracting Character
|
|
112
|
+
Networks from Narrative Texts},
|
|
113
|
+
journal = {Journal of Open Source Software},
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
We would be happy to hear about your usage of Renard, so don't hesitate to reach out!
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
renard/gender.py
|
|
5
|
+
renard/graph_utils.py
|
|
6
|
+
renard/ner_utils.py
|
|
7
|
+
renard/nltk_utils.py
|
|
8
|
+
renard/plot_utils.py
|
|
9
|
+
renard/py.typed
|
|
10
|
+
renard/utils.py
|
|
11
|
+
renard/pipeline/__init__.py
|
|
12
|
+
renard/pipeline/character_unification.py
|
|
13
|
+
renard/pipeline/characters_extraction.py
|
|
14
|
+
renard/pipeline/core.py
|
|
15
|
+
renard/pipeline/graph_extraction.py
|
|
16
|
+
renard/pipeline/preconfigured.py
|
|
17
|
+
renard/pipeline/preprocessing.py
|
|
18
|
+
renard/pipeline/progress.py
|
|
19
|
+
renard/pipeline/quote_detection.py
|
|
20
|
+
renard/pipeline/sentiment_analysis.py
|
|
21
|
+
renard/pipeline/speaker_attribution.py
|
|
22
|
+
renard/pipeline/stanford_corenlp.py
|
|
23
|
+
renard/pipeline/tokenization.py
|
|
24
|
+
renard/pipeline/corefs/__init__.py
|
|
25
|
+
renard/pipeline/corefs/corefs.py
|
|
26
|
+
renard/pipeline/ner/__init__.py
|
|
27
|
+
renard/pipeline/ner/ner.py
|
|
28
|
+
renard/pipeline/ner/retrieval.py
|
|
29
|
+
renard/resources/determiners/__init__.py
|
|
30
|
+
renard/resources/determiners/determiners.py
|
|
31
|
+
renard/resources/hypocorisms/__init__.py
|
|
32
|
+
renard/resources/hypocorisms/hypocorisms.py
|
|
33
|
+
renard/resources/pronouns/__init__.py
|
|
34
|
+
renard/resources/pronouns/pronouns.py
|
|
35
|
+
renard/resources/titles/__init__.py
|
|
36
|
+
renard/resources/titles/titles.py
|
|
37
|
+
renard_pipeline.egg-info/PKG-INFO
|
|
38
|
+
renard_pipeline.egg-info/SOURCES.txt
|
|
39
|
+
renard_pipeline.egg-info/dependency_links.txt
|
|
40
|
+
renard_pipeline.egg-info/requires.txt
|
|
41
|
+
renard_pipeline.egg-info/top_level.txt
|
|
42
|
+
tests/test_character_unification.py
|
|
43
|
+
tests/test_corefs.py
|
|
44
|
+
tests/test_graph_extraction.py
|
|
45
|
+
tests/test_graph_utils.py
|
|
46
|
+
tests/test_ner.py
|
|
47
|
+
tests/test_pipeline.py
|
|
48
|
+
tests/test_quote_detection.py
|
|
49
|
+
tests/test_sentiment_analysis.py
|
|
50
|
+
tests/test_stanza.py
|
|
51
|
+
tests/test_tokenization.py
|
|
52
|
+
tests/test_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
renard
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from renard.pipeline import Mention
|
|
2
|
+
from renard.pipeline.character_unification import (
|
|
3
|
+
Character,
|
|
4
|
+
_assign_coreference_mentions,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_assign_coreference_mentions():
|
|
9
|
+
characters = _assign_coreference_mentions(
|
|
10
|
+
[Character(frozenset(["John Traitor"]), [Mention(["John", "Traitor"], 0, 1)])],
|
|
11
|
+
[[Mention(["John", "Traitor"], 0, 1), Mention(["He"], 10, 10)]],
|
|
12
|
+
)
|
|
13
|
+
assert characters[0] == Character(
|
|
14
|
+
frozenset(["John Traitor"]),
|
|
15
|
+
[Mention(["John", "Traitor"], 0, 1), Mention(["He"], 10, 10)],
|
|
16
|
+
)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pytest
|
|
3
|
+
from renard.pipeline import Pipeline
|
|
4
|
+
from renard.pipeline.corefs import BertCoreferenceResolver
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.mark.skipif(os.getenv("RENARD_TEST_ALL") != "1", reason="performance")
|
|
8
|
+
def test_bert_coreference_resolver_runs():
|
|
9
|
+
pipeline = Pipeline([BertCoreferenceResolver()], progress_report=None)
|
|
10
|
+
tokens = "Princess Liana felt sad , because Zarth Arn was gone . The princess went to sleep .".split(
|
|
11
|
+
" "
|
|
12
|
+
)
|
|
13
|
+
corefs = pipeline(tokens=tokens).corefs
|
|
14
|
+
assert not corefs is None
|
|
15
|
+
assert len(corefs) >= 1
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import List
|
|
3
|
+
import itertools, string
|
|
4
|
+
from hypothesis import given
|
|
5
|
+
from hypothesis.strategies import lists, sampled_from
|
|
6
|
+
from hypothesis.strategies._internal.numbers import integers
|
|
7
|
+
import networkx as nx
|
|
8
|
+
from networkx.algorithms import isomorphism
|
|
9
|
+
from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
|
|
10
|
+
from renard.pipeline.character_unification import Character
|
|
11
|
+
from renard.pipeline.ner import ner_entities, NEREntity
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _characters_from_mentions(mentions: List[NEREntity]) -> List[Character]:
|
|
15
|
+
"""Generate characters from a list of mentions"""
|
|
16
|
+
name_to_mentions = defaultdict(list)
|
|
17
|
+
for mention in mentions:
|
|
18
|
+
name_to_mentions[" ".join(mention.tokens)].append(mention)
|
|
19
|
+
return [
|
|
20
|
+
Character(frozenset((name,)), mentions)
|
|
21
|
+
for name, mentions in name_to_mentions.items()
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# max size used for performance reasons
|
|
26
|
+
@given(lists(sampled_from(string.ascii_uppercase), max_size=7))
|
|
27
|
+
def test_basic_graph_extraction(tokens: List[str]):
|
|
28
|
+
bio_tags = ["B-PER" for _ in tokens]
|
|
29
|
+
|
|
30
|
+
mentions = ner_entities(tokens, bio_tags)
|
|
31
|
+
characters = _characters_from_mentions(mentions)
|
|
32
|
+
|
|
33
|
+
graph_extractor = CoOccurrencesGraphExtractor(len(tokens))
|
|
34
|
+
out = graph_extractor(set(characters), [tokens])
|
|
35
|
+
|
|
36
|
+
characters = {
|
|
37
|
+
token: Character(
|
|
38
|
+
frozenset([token]), [m for m in mentions if m.tokens[0] == token]
|
|
39
|
+
)
|
|
40
|
+
for token in set(tokens)
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
G = nx.Graph()
|
|
44
|
+
for character in characters.values():
|
|
45
|
+
G.add_node(character)
|
|
46
|
+
|
|
47
|
+
for i, j in itertools.combinations(range(len(tokens)), 2):
|
|
48
|
+
A = characters[tokens[i]]
|
|
49
|
+
B = characters[tokens[j]]
|
|
50
|
+
if A == B:
|
|
51
|
+
continue
|
|
52
|
+
if not G.has_edge(A, B):
|
|
53
|
+
G.add_edge(A, B, weight=0)
|
|
54
|
+
G.edges[A, B]["weight"] += 1
|
|
55
|
+
|
|
56
|
+
assert nx.is_isomorphic(
|
|
57
|
+
out["character_network"],
|
|
58
|
+
G,
|
|
59
|
+
edge_match=isomorphism.numerical_edge_match("weight", 0),
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@given(
|
|
64
|
+
lists(sampled_from(string.ascii_uppercase), min_size=1),
|
|
65
|
+
integers(min_value=1, max_value=5),
|
|
66
|
+
)
|
|
67
|
+
def test_dynamic_graph_extraction(tokens: List[str], dynamic_window: int):
|
|
68
|
+
"""
|
|
69
|
+
.. note::
|
|
70
|
+
|
|
71
|
+
only tests execution.
|
|
72
|
+
"""
|
|
73
|
+
bio_tags = ["B-PER" for _ in tokens]
|
|
74
|
+
|
|
75
|
+
mentions = ner_entities(tokens, bio_tags)
|
|
76
|
+
characters = _characters_from_mentions(mentions)
|
|
77
|
+
|
|
78
|
+
graph_extractor = CoOccurrencesGraphExtractor(
|
|
79
|
+
len(tokens), dynamic=True, dynamic_window=dynamic_window
|
|
80
|
+
)
|
|
81
|
+
out = graph_extractor(set(characters), [tokens])
|
|
82
|
+
|
|
83
|
+
assert len(out["character_network"]) > 0
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@given(lists(sampled_from(string.ascii_uppercase)))
|
|
87
|
+
def test_polarity_extraction(tokens: List[str]):
|
|
88
|
+
graph_extractor = CoOccurrencesGraphExtractor(10)
|
|
89
|
+
|
|
90
|
+
bio_tags = ["B-PER"] * len(tokens)
|
|
91
|
+
|
|
92
|
+
mentions = ner_entities(tokens, bio_tags)
|
|
93
|
+
characters = _characters_from_mentions(mentions)
|
|
94
|
+
|
|
95
|
+
out = graph_extractor(
|
|
96
|
+
set(characters),
|
|
97
|
+
sentences=[tokens],
|
|
98
|
+
sentences_polarities=[1.0],
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
for character1, character2 in itertools.combinations(characters, 2):
|
|
102
|
+
if out["character_network"].has_edge(character1, character2):
|
|
103
|
+
assert "polarity" in out["character_network"].edges[character1, character2]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@given(lists(sampled_from(string.ascii_uppercase), min_size=1))
|
|
107
|
+
def test_sent_co_occurence_dist(sent1: List[str]):
|
|
108
|
+
# sent2 is guaranteed to be different from sent1, so that we
|
|
109
|
+
# have 2 different characters
|
|
110
|
+
sent2 = [chr(ord(token) + 1) for token in sent1]
|
|
111
|
+
|
|
112
|
+
graph_extractor = CoOccurrencesGraphExtractor((1, "sentences"))
|
|
113
|
+
|
|
114
|
+
sentences = [sent1, sent2]
|
|
115
|
+
tokens = sent1 + sent2
|
|
116
|
+
tags = ["B-PER"] * len(tokens)
|
|
117
|
+
characters = _characters_from_mentions(ner_entities(tokens, tags))
|
|
118
|
+
|
|
119
|
+
out = graph_extractor(set(characters), sentences)
|
|
120
|
+
|
|
121
|
+
assert len(out["character_network"]) > 0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
from renard.graph_utils import cumulative_graph
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_cumulative_graph():
|
|
6
|
+
gs = [
|
|
7
|
+
nx.Graph([(0, 1, {"weight": 1})]),
|
|
8
|
+
nx.Graph([(0, 1, {"weight": 1}), (0, 2, {"weight": 1})]),
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
assert nx.is_isomorphic(
|
|
12
|
+
cumulative_graph(gs)[-1],
|
|
13
|
+
nx.Graph([(0, 1, {"weight": 2}), (0, 2, {"weight": 1})]),
|
|
14
|
+
)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from typing import List, Type
|
|
2
|
+
import string, os
|
|
3
|
+
import pytest
|
|
4
|
+
from hypothesis import given
|
|
5
|
+
from hypothesis.control import assume
|
|
6
|
+
from hypothesis.strategies import lists, sampled_from
|
|
7
|
+
from transformers import BertTokenizerFast
|
|
8
|
+
from renard.ner_utils import NERDataset
|
|
9
|
+
from renard.pipeline.ner import ner_entities, score_ner
|
|
10
|
+
from renard.pipeline.ner.retrieval import (
|
|
11
|
+
NERBM25ContextRetriever,
|
|
12
|
+
NERContextRetriever,
|
|
13
|
+
NEREnsembleContextRetriever,
|
|
14
|
+
NERNeighborsContextRetriever,
|
|
15
|
+
NERSamenounContextRetriever,
|
|
16
|
+
NERNeuralContextRetriever,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.mark.skipif(
|
|
21
|
+
os.getenv("RENARD_TEST_SEQEVAL_OPTDEP") != "1",
|
|
22
|
+
reason="not testing seqeval based functions",
|
|
23
|
+
)
|
|
24
|
+
@given(lists(sampled_from(("B-PER", "I-PER", "O")), min_size=1))
|
|
25
|
+
def test_score_same_tags(tags: List[str]):
|
|
26
|
+
assume("B-PER" in tags)
|
|
27
|
+
assert (1.0, 1.0, 1.0) == score_ner(tags, tags)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@given(lists(sampled_from(string.ascii_uppercase)))
|
|
31
|
+
def test_has_correct_number_of_entities(tokens: List[str]):
|
|
32
|
+
bio_tags = ["B-PER" for _ in tokens]
|
|
33
|
+
entities = ner_entities(tokens, bio_tags)
|
|
34
|
+
assert len(entities) == len(tokens)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pytest.mark.skipif(os.getenv("RENARD_TEST_ALL") != "1", reason="performance")
|
|
38
|
+
@pytest.mark.parametrize(
|
|
39
|
+
"retriever_class", [NERSamenounContextRetriever, NERBM25ContextRetriever]
|
|
40
|
+
)
|
|
41
|
+
def test_retrieves_context(retriever_class: Type[NERContextRetriever]):
|
|
42
|
+
context_retriever = retriever_class(1)
|
|
43
|
+
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
|
|
44
|
+
sentences = [
|
|
45
|
+
"this is some test sentence .".split(" "),
|
|
46
|
+
"this is another test sentence .".split(" "),
|
|
47
|
+
]
|
|
48
|
+
dataset = NERDataset(sentences, tokenizer)
|
|
49
|
+
ctx_dataset = context_retriever(dataset)
|
|
50
|
+
assert ctx_dataset.elements[0] == sentences[0] + sentences[1]
|
|
51
|
+
assert ctx_dataset.elements[1] == sentences[0] + sentences[1]
|
|
52
|
+
assert len(ctx_dataset.elements) == len(sentences)
|
|
53
|
+
assert len(ctx_dataset._context_mask) == len(sentences)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.mark.skipif(os.getenv("RENARD_TEST_ALL") != "1", reason="performance")
|
|
57
|
+
def test_neural_retrieves_context():
|
|
58
|
+
context_retriever = NERNeuralContextRetriever(
|
|
59
|
+
NEREnsembleContextRetriever(
|
|
60
|
+
[
|
|
61
|
+
NERSamenounContextRetriever(1),
|
|
62
|
+
NERBM25ContextRetriever(1),
|
|
63
|
+
NERNeighborsContextRetriever(2),
|
|
64
|
+
],
|
|
65
|
+
k=4,
|
|
66
|
+
),
|
|
67
|
+
k=1,
|
|
68
|
+
)
|
|
69
|
+
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
|
|
70
|
+
sentences = [
|
|
71
|
+
"this is some test sentence .".split(" "),
|
|
72
|
+
"this is another test sentence .".split(" "),
|
|
73
|
+
]
|
|
74
|
+
dataset = NERDataset(sentences, tokenizer)
|
|
75
|
+
ctx_dataset = context_retriever(dataset)
|
|
76
|
+
assert ctx_dataset.elements[0] == sentences[0] + sentences[1]
|
|
77
|
+
assert ctx_dataset.elements[1] == sentences[0] + sentences[1]
|
|
78
|
+
assert len(ctx_dataset.elements) == len(sentences)
|
|
79
|
+
assert len(ctx_dataset._context_mask) == len(sentences)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from typing import Set
|
|
2
|
+
import os
|
|
3
|
+
import pytest
|
|
4
|
+
from renard.pipeline.core import Pipeline, PipelineStep
|
|
5
|
+
from renard.pipeline.preconfigured import bert_pipeline, nltk_pipeline
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
script_dir = os.path.abspath(os.path.dirname(__file__))
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_pipeline_is_valid():
|
|
12
|
+
class TestPipelineStep1(PipelineStep):
|
|
13
|
+
def needs(self) -> Set[str]:
|
|
14
|
+
return set()
|
|
15
|
+
|
|
16
|
+
def production(self) -> Set[str]:
|
|
17
|
+
return {"info_1"}
|
|
18
|
+
|
|
19
|
+
class TestPipelineStep2(PipelineStep):
|
|
20
|
+
def needs(self) -> Set[str]:
|
|
21
|
+
return {"info_1"}
|
|
22
|
+
|
|
23
|
+
def production(self) -> Set[str]:
|
|
24
|
+
return set()
|
|
25
|
+
|
|
26
|
+
pipeline = Pipeline([TestPipelineStep1(), TestPipelineStep2()])
|
|
27
|
+
|
|
28
|
+
assert pipeline.check_valid()[0]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_pipeline_is_invalid():
|
|
32
|
+
class TestPipelineStep1(PipelineStep):
|
|
33
|
+
def needs(self) -> Set[str]:
|
|
34
|
+
return set()
|
|
35
|
+
|
|
36
|
+
def production(self) -> Set[str]:
|
|
37
|
+
return set()
|
|
38
|
+
|
|
39
|
+
class TestPipelineStep2(PipelineStep):
|
|
40
|
+
def needs(self) -> Set[str]:
|
|
41
|
+
return {"info_1"}
|
|
42
|
+
|
|
43
|
+
def production(self) -> Set[str]:
|
|
44
|
+
return set()
|
|
45
|
+
|
|
46
|
+
pipeline = Pipeline([TestPipelineStep1(), TestPipelineStep2()])
|
|
47
|
+
|
|
48
|
+
assert not pipeline.check_valid()[0]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@pytest.mark.skipif(os.getenv("RENARD_TEST_ALL") != "1", reason="performance")
|
|
52
|
+
def test_nltk_pipeline_runs():
|
|
53
|
+
with open(f"{script_dir}/pp_chapter1.txt") as f:
|
|
54
|
+
text = f.read()
|
|
55
|
+
pipeline = nltk_pipeline(
|
|
56
|
+
warn=False,
|
|
57
|
+
progress_report=None,
|
|
58
|
+
graph_extractor_kwargs={"co_occurrences_dist": (1, "sentences")},
|
|
59
|
+
)
|
|
60
|
+
pipeline(text)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@pytest.mark.skipif(os.getenv("RENARD_TEST_ALL") != "1", reason="performance")
|
|
64
|
+
def test_bert_pipeline_runs():
|
|
65
|
+
with open(f"{script_dir}/pp_chapter1.txt") as f:
|
|
66
|
+
text = f.read()
|
|
67
|
+
pipeline = bert_pipeline(
|
|
68
|
+
warn=False,
|
|
69
|
+
progress_report=None,
|
|
70
|
+
graph_extractor_kwargs={"co_occurrences_dist": (1, "sentences")},
|
|
71
|
+
)
|
|
72
|
+
pipeline(text)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@pytest.mark.skipif(os.getenv("RENARD_TEST_ALL") != "1", reason="performance")
|
|
76
|
+
def test_conversational_pipeline_runs():
|
|
77
|
+
|
|
78
|
+
with open(f"{script_dir}/pp_chapter1.txt") as f:
|
|
79
|
+
text = f.read()
|
|
80
|
+
# if the text is too long, speaker attribution takes a long time
|
|
81
|
+
text = text[:500]
|
|
82
|
+
|
|
83
|
+
pipeline = nltk_pipeline(
|
|
84
|
+
warn=False,
|
|
85
|
+
progress_report=None,
|
|
86
|
+
conversational=True,
|
|
87
|
+
graph_extractor_kwargs={
|
|
88
|
+
"graph_type": "conversation",
|
|
89
|
+
"conversation_dist": (3, "sentences"),
|
|
90
|
+
},
|
|
91
|
+
)
|
|
92
|
+
pipeline(text)
|