complex-evaluate 0.0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- complex_evaluate-0.0.1/PKG-INFO +121 -0
- complex_evaluate-0.0.1/README.md +110 -0
- complex_evaluate-0.0.1/pyproject.toml +17 -0
- complex_evaluate-0.0.1/src/complex_evaluate/__init__.py +0 -0
- complex_evaluate-0.0.1/src/complex_evaluate/evaluate.py +200 -0
- complex_evaluate-0.0.1/src/complex_evaluate/uted.py +158 -0
@@ -0,0 +1,121 @@
+Metadata-Version: 2.3
+Name: complex_evaluate
+Version: 0.0.1
+Summary: Package to evaluate complex alignments.
+Author: Guilherme Henrique
+Author-email: Guilherme Henrique <guihss.cs@gmail.com>
+Requires-Python: >=3.9
+Project-URL: Homepage, https://github.com/guihcs/complex_evaluate
+Project-URL: Issues, https://github.com/guihcs/complex_evaluate/issues
+Description-Content-Type: text/markdown
+
+# Complex Evaluate
+
+[](https://www.python.org/downloads/)
+[](LICENSE)
+[](https://github.com/guihcs/complex_evaluate/actions/workflows/tests.yml)
+[](https://codecov.io/gh/guihcs/complex_evaluate)
+
+A Python library for evaluating complex ontology alignments in the [EDOAL](https://moex.gitlabpages.inria.fr/alignapi/edoal.html) (Expressive and Declarative Ontology Alignment Language) format, adapting precision, recall, and F-measure metrics to the complex matching case.
+
+### Requirements
+
+- Python >= 3.9
+- NumPy
+- SciPy
+
+## 📦 Installation
+
+```bash
+pip install complex_evaluate
+```
+## 📖 Usage
+
+### Basic Example
+
+```python
+from complex_evaluate.evaluate import evaluate_edoal
+
+# Compare two alignment files
+precision, recall, f_measure = evaluate_edoal(
+    'predicted_alignment.edoal',
+    'reference_alignment.edoal'
+)
+
+print(f"Precision: {precision:.3f}")
+print(f"Recall: {recall:.3f}")
+print(f"F-measure: {f_measure:.3f}")
+```
+
+### Comparing from strings
+
+```python
+from complex_evaluate.evaluate import evaluate_edoal_string
+
+predicted = '''<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://knowledgeweb.semanticweb.org/heterogeneity/alignment#">
+    <Alignment>
+        <map>
+            <Cell>
+                <entity1>
+                    <Class rdf:about="http://example.org#ClassA" />
+                </entity1>
+                <entity2>
+                    <Class rdf:about="http://example.org#ClassB" />
+                </entity2>
+            </Cell>
+        </map>
+    </Alignment>
+</rdf:RDF>'''
+
+reference = predicted  # Use same for identity test
+
+p, r, f = evaluate_edoal_string(predicted, reference)
+print(f"F-measure: {f}")  # Should be 1.0 for identical alignments
+```
+
+## 📊 Use Cases
+
+This metric was used in the evaluation of the [OAEI 2025 Complex Matching track](https://oaei.ontologymatching.org/2025/results/complex/index.html).
+
+Beyond that, this library is particularly useful for:
+
+- **Ontology Alignment Evaluation**: Benchmarking alignment approaches on complex matching tasks.
+- **LLM reasoning training**: The metric can support training LLMs to reason about complex alignments by providing a verifiable reward signal based on the score of the predicted alignment against a reference alignment.
+
+## 🤝 Contributing
+
+Contributions are welcome! Some areas for improvement:
+- Additional similarity metrics.
+- Performance optimizations.
+- Support for other alignment formats.
+- Extended documentation and examples.
+
+## 📄 License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 📚 Citation
+
+If you use this library in your research, please cite it as follows:
+
+```bibtex
+@inproceedings{DBLP:conf/esws/SousaLS25,
+  author    = {Guilherme Henrique Santos Sousa and
+               Rinaldo Lima and
+               C{\'{a}}ssia Trojahn dos Santos},
+  title     = {On Evaluation Metrics for Complex Matching Based on Reference Alignments},
+  booktitle = {{ESWC} {(1)}},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {15718},
+  pages     = {77--93},
+  publisher = {Springer},
+  year      = {2025}
+}
+```
+
+---
+
+*Built with ❤️ for the Semantic Web and Ontology Matching community.*
+
@@ -0,0 +1,110 @@
+# Complex Evaluate
+
+[](https://www.python.org/downloads/)
+[](LICENSE)
+[](https://github.com/guihcs/complex_evaluate/actions/workflows/tests.yml)
+[](https://codecov.io/gh/guihcs/complex_evaluate)
+
+A Python library for evaluating complex ontology alignments in the [EDOAL](https://moex.gitlabpages.inria.fr/alignapi/edoal.html) (Expressive and Declarative Ontology Alignment Language) format, adapting precision, recall, and F-measure metrics to the complex matching case.
+
+### Requirements
+
+- Python >= 3.9
+- NumPy
+- SciPy
+
+## 📦 Installation
+
+```bash
+pip install complex_evaluate
+```
+## 📖 Usage
+
+### Basic Example
+
+```python
+from complex_evaluate.evaluate import evaluate_edoal
+
+# Compare two alignment files
+precision, recall, f_measure = evaluate_edoal(
+    'predicted_alignment.edoal',
+    'reference_alignment.edoal'
+)
+
+print(f"Precision: {precision:.3f}")
+print(f"Recall: {recall:.3f}")
+print(f"F-measure: {f_measure:.3f}")
+```
+
+### Comparing from strings
+
+```python
+from complex_evaluate.evaluate import evaluate_edoal_string
+
+predicted = '''<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://knowledgeweb.semanticweb.org/heterogeneity/alignment#">
+    <Alignment>
+        <map>
+            <Cell>
+                <entity1>
+                    <Class rdf:about="http://example.org#ClassA" />
+                </entity1>
+                <entity2>
+                    <Class rdf:about="http://example.org#ClassB" />
+                </entity2>
+            </Cell>
+        </map>
+    </Alignment>
+</rdf:RDF>'''
+
+reference = predicted  # Use same for identity test
+
+p, r, f = evaluate_edoal_string(predicted, reference)
+print(f"F-measure: {f}")  # Should be 1.0 for identical alignments
+```
+
+## 📊 Use Cases
+
+This metric was used in the evaluation of the [OAEI 2025 Complex Matching track](https://oaei.ontologymatching.org/2025/results/complex/index.html).
+
+Beyond that, this library is particularly useful for:
+
+- **Ontology Alignment Evaluation**: Benchmarking alignment approaches on complex matching tasks.
+- **LLM reasoning training**: The metric can support training LLMs to reason about complex alignments by providing a verifiable reward signal based on the score of the predicted alignment against a reference alignment (a minimal reward sketch follows below this file).
+
+## 🤝 Contributing
+
+Contributions are welcome! Some areas for improvement:
+- Additional similarity metrics.
+- Performance optimizations.
+- Support for other alignment formats.
+- Extended documentation and examples.
+
+## 📄 License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 📚 Citation
+
+If you use this library in your research, please cite it as follows:
+
+```bibtex
+@inproceedings{DBLP:conf/esws/SousaLS25,
+  author    = {Guilherme Henrique Santos Sousa and
+               Rinaldo Lima and
+               C{\'{a}}ssia Trojahn dos Santos},
+  title     = {On Evaluation Metrics for Complex Matching Based on Reference Alignments},
+  booktitle = {{ESWC} {(1)}},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {15718},
+  pages     = {77--93},
+  publisher = {Springer},
+  year      = {2025}
+}
+```
+
+---
+
+*Built with ❤️ for the Semantic Web and Ontology Matching community.*
+
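To make the reward-signal use case concrete, here is a minimal sketch (the `alignment_reward` helper is hypothetical, not part of the package): it scores a model-generated EDOAL string against a reference and returns the F-measure as a reward in [0, 1].

```python
from complex_evaluate.evaluate import evaluate_edoal_string

def alignment_reward(predicted_edoal: str, reference_edoal: str) -> float:
    # ignore_errors=True skips cells that cannot be parsed into entity trees
    # (the XML itself must still be well-formed), which helps with model output.
    _, _, f_measure = evaluate_edoal_string(
        predicted_edoal, reference_edoal, ignore_errors=True
    )
    return f_measure
```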
@@ -0,0 +1,17 @@
+[build-system]
+requires = ["uv_build >= 0.9.11, <0.10.0"]
+build-backend = "uv_build"
+
+[project]
+name = "complex_evaluate"
+version = "0.0.1"
+authors = [
+    { name="Guilherme Henrique", email="guihss.cs@gmail.com" },
+]
+description = "Package to evaluate complex alignments."
+readme = "README.md"
+requires-python = ">=3.9"
+
+[project.urls]
+Homepage = "https://github.com/guihcs/complex_evaluate"
+Issues = "https://github.com/guihcs/complex_evaluate/issues"
File without changes
@@ -0,0 +1,200 @@
+import xml.etree.ElementTree as ET
+from complex_evaluate.uted import u_sim, tree_size, compute_cache
+import re
+import io
+
+def xml_to_tree(t):  # converts an XML element into a (label, children) tuple tree
+    att_keys = sorted(list(t.attrib.keys()))  # canonical attribute order
+    have_about = False
+    k = None
+    if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about' in att_keys:
+        att_keys.remove('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
+        have_about = True
+        k = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about'
+    att_pairs = ', '.join([f'{k}: {t.attrib[k]}' for k in att_keys])
+    tag = f'{t.tag} {att_pairs}'
+
+    children = sorted([xml_to_tree(c) for c in t], key=lambda x: str(x))  # sorted for order-independent comparison
+    if have_about:
+        children.append((f'{k}{t.attrib[k]}', []))  # rdf:about becomes an extra leaf
+
+    return tag, children
+
+
+def load_maps_string(path, ignore_errors = False):
+    root = ET.fromstring(path)  # `path` holds the alignment XML as a string here
+
+    maps = []
+
+    for c in root:
+        for c1 in c:
+            if not c1.tag.endswith('map'):
+                continue
+            if ignore_errors:
+                try:
+                    cell = c1.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}Cell')
+                    ent1 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity1')
+                    ent2 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity2')
+
+                    t1 = ent1[0] if len(ent1) > 0 else ent1
+                    t2 = ent2[0] if len(ent2) > 0 else ent2
+
+                    maps.append((xml_to_tree(t1), xml_to_tree(t2)))
+                except Exception:  # skip cells that cannot be parsed
+                    pass
+            else:
+                cell = c1.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}Cell')
+                ent1 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity1')
+                ent2 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity2')
+
+                t1 = ent1[0] if len(ent1) > 0 else ent1
+                t2 = ent2[0] if len(ent2) > 0 else ent2
+
+                maps.append((xml_to_tree(t1), xml_to_tree(t2)))
+
+    return maps
+
+def load_maps(path, ignore_errors = False):
+    tree = ET.parse(path)
+    root = tree.getroot()
+
+    maps = []
+
+    for c in root:
+        for c1 in c:
+            if not c1.tag.endswith('map'):
+                continue
+            if ignore_errors:
+                try:
+                    cell = c1.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}Cell')
+                    ent1 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity1')
+                    ent2 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity2')
+
+                    t1 = ent1[0] if len(ent1) > 0 else ent1
+                    t2 = ent2[0] if len(ent2) > 0 else ent2
+
+                    maps.append((xml_to_tree(t1), xml_to_tree(t2)))
+                except Exception:  # skip cells that cannot be parsed
+                    pass
+            else:
+                cell = c1.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}Cell')
+                ent1 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity1')
+                ent2 = cell.find('{http://knowledgeweb.semanticweb.org/heterogeneity/alignment#}entity2')
+
+                t1 = ent1[0] if len(ent1) > 0 else ent1
+                t2 = ent2[0] if len(ent2) > 0 else ent2
+
+                maps.append((xml_to_tree(t1), xml_to_tree(t2)))
+
+    return maps
+
+def maximize_assign(m):  # greedy one-to-one assignment, deferred-acceptance style
+
+    preferences = {}
+
+    for i, p in enumerate(m):
+        preferences[i] = list(sorted(enumerate(p), key=lambda x: x[1], reverse=True))  # best candidates first
+
+    unassigned_pairs = list(range(len(m)))
+
+    assigned_pairs = {}
+
+    while unassigned_pairs:
+        pair = unassigned_pairs.pop()
+        pair_similarities = preferences[pair]
+        if len(pair_similarities) == 0:
+            continue
+
+        next_similarity = pair_similarities.pop(0)
+
+        if next_similarity[0] in assigned_pairs:
+            if next_similarity[1] > assigned_pairs[next_similarity[0]][1]:  # steal the slot if strictly better
+                unassigned_pairs.append(assigned_pairs[next_similarity[0]][0])
+                assigned_pairs[next_similarity[0]] = (pair, next_similarity[1])
+            else:
+                unassigned_pairs.append(pair)  # retry with this row's next candidate
+        else:
+            assigned_pairs[next_similarity[0]] = (pair, next_similarity[1])
+
+    return assigned_pairs
+
+
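A quick illustration of how `maximize_assign` resolves competing matches; the 2×2 similarity matrix below is made up for this note (rows index predicted correspondences, columns reference ones):

```python
from complex_evaluate.evaluate import maximize_assign

# Both rows prefer reference column 0; row 1 wins it (0.9 > 0.7),
# so row 0 falls back to its next-best candidate, column 1.
sim = [[0.7, 0.3],
       [0.9, 0.1]]

# Maps reference index -> (predicted index, similarity).
print(maximize_assign(sim))  # {0: (1, 0.9), 1: (0, 0.3)}
```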
+def evaluate_edoal_string(p1, p2, w = 0.5, sim_func = u_sim, ignore_errors = False, soft=False):
+    return evaluate_edoal(io.StringIO(initial_value=p1), io.StringIO(initial_value=p2), w, sim_func, ignore_errors=ignore_errors, soft=soft)
+
+def evaluate_edoal(p1, p2, w = 0.5, sim_func = u_sim, ignore_errors = False, soft=False):
+    maps1 = load_maps(p1, ignore_errors=ignore_errors)
+    maps2 = load_maps(p2, ignore_errors=ignore_errors)
+
+    p_simple_count = 0
+    p_complex_count = 0
+    for m1, m2 in maps1:
+        if tree_size(m1) == 1 and tree_size(m2) == 1:  # both entities atomic: simple correspondence
+            p_simple_count += 1
+        else:
+            p_complex_count += 1
+
+    r_simple_count = 0
+    r_complex_count = 0
+    for m1, m2 in maps2:
+        if tree_size(m1) == 1 and tree_size(m2) == 1:
+            r_simple_count += 1
+        else:
+            r_complex_count += 1
+
+    sl = []
+
+    cs1 = []
+
+    for mt1, mt2 in maps1:
+        cs1.append((compute_cache(mt1), compute_cache(mt2)))
+
+    cs2 = []
+    for m1, m2 in maps2:
+        cs2.append((compute_cache(m1), compute_cache(m2)))
+
+
+    for (mt1, mt2), (c1, c2) in zip(maps1, cs1):
+        ms = []
+        for (m1, m2), (cc1, cc2) in zip(maps2, cs2):
+            ms.append((sim_func(mt1, m1, cache=[*c1, *cc1]) + sim_func(mt2, m2, cache=[*c2, *cc2])) / 2)  # mean similarity of both sides
+
+        sl.append(ms)
+
+    assigns = maximize_assign(sl)
+
+    p_simple_assigns = {}
+    p_complex_assigns = {}
+    r_simple_assigns = {}
+    r_complex_assigns = {}
+
+    for k, v in assigns.items():
+        mt1, mt2 = maps1[v[0]]
+        m1, m2 = maps2[k]
+        if tree_size(m1) == 1 and tree_size(m2) == 1:
+            r_simple_assigns[k] = v
+        else:
+            r_complex_assigns[k] = v
+
+        if tree_size(mt1) == 1 and tree_size(mt2) == 1:
+            p_simple_assigns[k] = v
+        else:
+            p_complex_assigns[k] = v
+
+    sum1 = sum([v[1] for k, v in r_simple_assigns.items()])
+    sum2 = sum([v[1] for k, v in r_complex_assigns.items()])
+    sum3 = sum([v[1] for k, v in p_simple_assigns.items()])
+    sum4 = sum([v[1] for k, v in p_complex_assigns.items()])
+
+    rdiv = (1 - w) * r_simple_count + w * r_complex_count
+    pdiv = (1 - w) * p_simple_count + w * p_complex_count
+    soft_recall = ((1 - w) * sum1 + w * sum2) / rdiv if rdiv > 0 else 0
+    soft_precision = ((1 - w) * sum3 + w * sum4) / pdiv if pdiv > 0 else 0
+    soft_fmeasure = 2 * soft_recall * soft_precision / (
+        soft_recall + soft_precision) if soft_recall + soft_precision > 0 else 0
+    return soft_precision, soft_recall, soft_fmeasure
+
+
+def filter_entities(n):  # extract resource/about URIs from a node label
+    return [x.split(' ')[1] for x in re.findall(r'resource: http://[^\s]+', n)] + [x.split(' ')[1] for x in re.findall(r'about: http://[^\s]+', n)]
+
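For reference, the weighted soft metrics computed at the end of `evaluate_edoal` can be written as follows (notation mine, read directly off the code above; $\sigma_m$ is the assigned similarity of correspondence $m$, and subscripts $s$/$c$ split the predicted alignment $P$ and reference alignment $R$ into simple and complex correspondences):

$$
P_\text{soft} = \frac{(1-w)\sum_{m \in P_s} \sigma_m + w \sum_{m \in P_c} \sigma_m}{(1-w)\,|P_s| + w\,|P_c|},
\qquad
R_\text{soft} = \frac{(1-w)\sum_{m \in R_s} \sigma_m + w \sum_{m \in R_c} \sigma_m}{(1-w)\,|R_s| + w\,|R_c|},
\qquad
F_\text{soft} = \frac{2\,P_\text{soft}\,R_\text{soft}}{P_\text{soft} + R_\text{soft}}
$$

With the default $w = 0.5$, simple and complex correspondences contribute equally; a larger $w$ shifts the weight toward complex ones.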
@@ -0,0 +1,158 @@
+import heapq
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def dfs(t):  # pre-order traversal of a (label, children) tree
+    yield t
+    for c in t[1]:
+        yield from dfs(c)
+
+def tree_size(t):
+    return len(list(dfs(t)))
+
+def sc(a, b, x, y):  # node edit cost: 1 for insert/delete, 0 for match, 0.8 for relabel
+    if a == 0 or b == 0:
+        return 1
+    elif x[a-1][0] == y[b-1][0]:
+        return 0
+    else:
+        return 0.8
+
+def c(M, x, y):  # total cost of a (partial) mapping M
+    s = 0
+    for m in M:
+        if 0 in m:
+            s += 1
+        elif x[m[0]-1][0] != y[m[1]-1][0]:
+            s += 0.8
+    return s
+
+def heuristic(m, n, x, y):  # lower bound via optimal unconstrained node matching
+    len_m = len(m)
+    len_n = len(n)
+
+    if len_m == 0:
+        return len_n
+    if len_n == 0:
+        return len_m
+
+    cm = np.ones((len_m + 1, len_n + 1), dtype=float)
+    cm[-1, -1] = 0
+
+    for i, mi in enumerate(m):
+        for j, nj in enumerate(n):
+            cm[i, j] = sc(mi, nj, x, y)
+
+    row_ind, col_ind = linear_sum_assignment(cm)
+    return cm[row_ind, col_ind].sum()
+
+def I(M):  # source-side node indices used in mapping M
+    return {x[0] for x in M}
+
+def J(M):  # target-side node indices used in mapping M
+    return {x[1] for x in M}
+
+def descendants(e, x):
+    d = set()
+    for c in x[e-1][1]:
+        d.add(x.index(c) + 1)
+        d.update(descendants(x.index(c) + 1, x))
+    return d
+
+def get_parent(e, t):
+    for x in dfs(t):
+        for c in x[1]:
+            if c == e:
+                return x
+
+def get_map(i, m, s):
+    for x in m:
+        if x[s] == i:
+            return x
+def get_index(e, x):
+    for i in range(len(x)):
+        if x[i] == e:
+            return i + 1
+
+def largest_ancestor(i, M, pm):  # nearest mapped ancestor of i that was not deleted
+    p = pm[i-1]
+    m = get_map(p, M, 0)
+    if m[1] != 0:
+        return m
+    return largest_ancestor(p, M, pm)
+
+def ancestor_path(s, p, pm2):  # ancestors of s strictly below p
+    path = []
+    while True:
+        n = pm2[s-1]
+        if n is None:
+            break
+        if n == p:
+            break
+        path.append(n)
+        s = n
+    return path
+
+
+def compute_cache(t1):  # node list, descendant sets and parent indices, for reuse
+    x = list(dfs(t1))
+    dx = [descendants(q, x) for q in range(1, len(x) + 1)]
+    pm1 = [get_index(get_parent(q, t1), x) for q in x]
+    return x, dx, pm1
+
+def u_ted(t1, t2, cache=None):  # A*-style unordered tree edit distance
+
+    hm = {}
+
+    def h(m, n, x, y):  # memoised heuristic
+        ky = (frozenset(m), frozenset(n))
+        if ky not in hm:
+            hm[ky] = heuristic(m, n, x, y)
+        return hm[ky]
+
+    t1 = (0, [t1])  # dummy roots so the real roots need not be matched
+    t2 = (0, [t2])
+
+    if cache is None:
+        x, dx, pm1 = compute_cache(t1)
+        y, dy, pm2 = compute_cache(t2)
+    else:
+        x, dx, pm1, y, dy, pm2 = cache
+
+    M = {(1, 1)}
+    Q = [(c(M, x, y) + h(set(range(1, len(x) + 1)), set(range(1, len(y) + 1)), x, y), M)]
+
+    while Q:
+        ch, M = heapq.heappop(Q)
+        i = min(set(range(1, len(x) + 2)) - I(M))  # next unmapped source node
+
+        if i == len(x) + 1:  # all source nodes mapped: insert the remaining target nodes
+            fm = M.union({(0, j) for j in set(range(1, len(y) + 1)) - J(M)})
+            return c(fm, x, y), fm
+
+        k, l = largest_ancestor(i, M, pm1)
+
+        hp = h(set(range(1, len(x) + 1)) - dx[k-1].union(I(M)), set(range(1, len(y) + 1)) - dy[l-1].union(J(M)), x, y)
+
+        M0 = M.union({(i, 0)})  # option 1: delete node i
+        h0 = h(dx[k-1] - I(M0), dy[l-1] - J(M0), x, y) + hp
+        nm = [(M0, h0)]
+
+        for j in dy[l-1] - J(M):  # option 2: map i to an unmapped descendant j of l
+            path = [p for p in ancestor_path(j, l, pm2)]
+            Mj = M.union({(i, j)}.union({(0, q) for q in path}))
+            hj = h(dx[i-1] - I(Mj), dy[j-1] - J(Mj), x, y) + h(dx[k-1] - dx[i-1].union(I(Mj)), dy[l-1] - dy[j-1].union(J(Mj)), x, y) + hp
+            nm.append((Mj, hj))
+
+        for mn, hn in nm:
+            heapq.heappush(Q, (c(mn, x, y) + hn, mn))
+
+def u_sim(t1, t2, cache=None):  # normalised tree similarity in [0, 1]
+    max_size = max(tree_size(t1), tree_size(t2))
+    if max_size == 0:
+        return 1.0
+
+    dist, _ = u_ted(t1, t2, cache)
+
+    return 1 - dist / max_size