cbrkit 0.3.1__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cbrkit-0.3.1 → cbrkit-0.4.0}/PKG-INFO +30 -3
- {cbrkit-0.3.1 → cbrkit-0.4.0}/README.md +29 -2
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/__init__.py +5 -3
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/global_sim/_aggregate.py +1 -1
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/global_sim/_attribute_value.py +1 -1
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/global_sim/graph/_astar.py +1 -1
- cbrkit-0.3.1/cbrkit/sim/_helpers.py → cbrkit-0.4.0/cbrkit/helpers.py +33 -1
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/retrieval.py +1 -1
- cbrkit-0.4.0/cbrkit/sim/__init__.py +9 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/sim/collections.py +1 -1
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/sim/numeric.py +8 -3
- cbrkit-0.4.0/cbrkit/sim/taxonomy.py +328 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/pyproject.toml +1 -1
- cbrkit-0.3.1/cbrkit/sim/__init__.py +0 -16
- cbrkit-0.3.1/cbrkit/sim/taxonomy.py +0 -118
- {cbrkit-0.3.1 → cbrkit-0.4.0}/LICENSE +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/__main__.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/api.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/cli.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/global_sim/__init__.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/global_sim/graph/__init__.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/global_sim/graph/_model.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/loaders.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/py.typed +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/sim/generic.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/sim/strings.py +0 -0
- {cbrkit-0.3.1 → cbrkit-0.4.0}/cbrkit/typing.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cbrkit
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
|
|
5
5
|
Home-page: https://wi2trier.github.io/cbrkit/
|
|
6
6
|
License: MIT
|
|
@@ -104,7 +104,7 @@ The following modules are part of CBRkit:
|
|
|
104
104
|
CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
|
|
105
105
|
We will explain all modules and their basic usage in the following sections.
|
|
106
106
|
|
|
107
|
-
### Loading Cases
|
|
107
|
+
### Loading Cases and Queries
|
|
108
108
|
|
|
109
109
|
The first step is to load cases and queries.
|
|
110
110
|
We provide predefined functions for the most common formats like CSV, JSON, and XML.
|
|
@@ -119,9 +119,36 @@ df = pd.read_csv("path/to/cases.csv")
|
|
|
119
119
|
cases = cbrkit.loaders.dataframe(df)
|
|
120
120
|
```
|
|
121
121
|
|
|
122
|
-
|
|
122
|
+
When dealing with formats like JSON, the files can be loaded directly:
|
|
123
123
|
|
|
124
124
|
```python
|
|
125
|
+
cases = cbrkit.loaders.json("path/to/cases.json")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Queries can be loaded using the same loader functions.
|
|
129
|
+
CBRkit expects the type of the queries to match the type of the cases.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
# for pandas
|
|
125
133
|
queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
|
|
134
|
+
# for json
|
|
135
|
+
queries = cbrkit.loaders.json("path/to/queries.json")
|
|
126
136
|
```
|
|
127
137
|
|
|
138
|
+
In case your query collection only contains a single entry, you can use the `singleton` function to extract it.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
query = cbrkit.helpers.singleton(queries)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Alternatively, you can also create a query directly in Python:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
# for pandas
|
|
148
|
+
query = pd.Series({"name": "John", "age": 25})
|
|
149
|
+
# for json
|
|
150
|
+
query = {"name": "John", "age": 25}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Similarity Measures and Aggregation
|
|
154
|
+
|
|
@@ -55,7 +55,7 @@ The following modules are part of CBRkit:
|
|
|
55
55
|
CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
|
|
56
56
|
We will explain all modules and their basic usage in the following sections.
|
|
57
57
|
|
|
58
|
-
### Loading Cases
|
|
58
|
+
### Loading Cases and Queries
|
|
59
59
|
|
|
60
60
|
The first step is to load cases and queries.
|
|
61
61
|
We provide predefined functions for the most common formats like CSV, JSON, and XML.
|
|
@@ -70,8 +70,35 @@ df = pd.read_csv("path/to/cases.csv")
|
|
|
70
70
|
cases = cbrkit.loaders.dataframe(df)
|
|
71
71
|
```
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
When dealing with formats like JSON, the files can be loaded directly:
|
|
74
74
|
|
|
75
75
|
```python
|
|
76
|
+
cases = cbrkit.loaders.json("path/to/cases.json")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Queries can be loaded using the same loader functions.
|
|
80
|
+
CBRkit expects the type of the queries to match the type of the cases.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
# for pandas
|
|
76
84
|
queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
|
|
85
|
+
# for json
|
|
86
|
+
queries = cbrkit.loaders.json("path/to/queries.json")
|
|
77
87
|
```
|
|
88
|
+
|
|
89
|
+
In case your query collection only contains a single entry, you can use the `singleton` function to extract it.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
query = cbrkit.helpers.singleton(queries)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Alternatively, you can also create a query directly in Python:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
# for pandas
|
|
99
|
+
query = pd.Series({"name": "John", "age": 25})
|
|
100
|
+
# for json
|
|
101
|
+
query = {"name": "John", "age": 25}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Similarity Measures and Aggregation
|
|
@@ -5,12 +5,14 @@
|
|
|
5
5
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
from . import global_sim, loaders, retrieval, sim, typing
|
|
9
8
|
|
|
10
|
-
|
|
9
|
+
from . import global_sim, helpers, loaders, retrieval, sim, typing
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
11
12
|
"loaders",
|
|
12
13
|
"sim",
|
|
13
14
|
"global_sim",
|
|
14
15
|
"typing",
|
|
15
16
|
"retrieval",
|
|
16
|
-
|
|
17
|
+
"helpers",
|
|
18
|
+
]
|
|
@@ -16,7 +16,7 @@ from cbrkit.global_sim.graph._model import (
|
|
|
16
16
|
NodeData,
|
|
17
17
|
NodeKey,
|
|
18
18
|
)
|
|
19
|
-
from cbrkit.sim._helpers import unpack_sims
|
|
19
|
+
from cbrkit.helpers import unpack_sims
|
|
20
20
|
from cbrkit.typing import Casebase, FloatProtocol, KeyType, SimPairFunc, SimType
|
|
21
21
|
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
|
-
from collections.abc import Iterable, Mapping, Sequence
|
|
2
|
+
from collections.abc import Collection, Iterable, Mapping, Sequence
|
|
3
3
|
from inspect import signature as inspect_signature
|
|
4
4
|
from typing import Any, cast
|
|
5
5
|
|
|
@@ -22,9 +22,41 @@ __all__ = [
|
|
|
22
22
|
"unpack_sim",
|
|
23
23
|
"unpack_sims",
|
|
24
24
|
"AbstractFloat",
|
|
25
|
+
"singleton",
|
|
25
26
|
]
|
|
26
27
|
|
|
27
28
|
|
|
29
|
+
def singleton(x: Mapping[Any, ValueType] | Collection[ValueType]) -> ValueType:
|
|
30
|
+
"""
|
|
31
|
+
Return the only element of the input, or raise an error if it does not contain exactly one element.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
x: The input collection or mapping.
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
The only element of the input.
|
|
38
|
+
|
|
39
|
+
Examples:
|
|
40
|
+
>>> singleton([1])
|
|
41
|
+
1
|
|
42
|
+
>>> singleton({1: "a"})
|
|
43
|
+
'a'
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
ValueError: If the input does not have exactly one element (i.e., it is empty or has multiple elements).
|
|
47
|
+
TypeError: If the input is not a collection or mapping.
|
|
48
|
+
"""
|
|
49
|
+
if len(x) != 1:
|
|
50
|
+
raise ValueError(f"Expected exactly one element, but got {len(x)}")
|
|
51
|
+
|
|
52
|
+
if isinstance(x, Mapping):
|
|
53
|
+
return next(iter(x.values()))
|
|
54
|
+
elif isinstance(x, Collection):
|
|
55
|
+
return next(iter(x))
|
|
56
|
+
else:
|
|
57
|
+
raise TypeError(f"Expected a Mapping or Collection, but got {type(x)}")
|
|
58
|
+
|
|
59
|
+
|
|
28
60
|
def dist2sim(distance: float) -> float:
|
|
29
61
|
"""Convert a distance to a similarity.
|
|
30
62
|
|
|
@@ -2,8 +2,8 @@ from collections.abc import Callable, Collection, Mapping, Sequence
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from typing import Any, Generic
|
|
4
4
|
|
|
5
|
+
from cbrkit.helpers import sim2map, unpack_sim
|
|
5
6
|
from cbrkit.loaders import python as load_python
|
|
6
|
-
from cbrkit.sim._helpers import sim2map, unpack_sim
|
|
7
7
|
from cbrkit.typing import (
|
|
8
8
|
AnySimFunc,
|
|
9
9
|
Casebase,
|
|
@@ -15,9 +15,11 @@ def linear(max: float, min: float = 0.0) -> SimPairFunc[Number, float]:
|
|
|
15
15
|
min: Minimum bound of the interval
|
|
16
16
|
|
|
17
17
|

|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
18
|
+
|
|
19
|
+
Examples:
|
|
20
|
+
>>> sim = linear(100)
|
|
21
|
+
>>> sim(50, 60)
|
|
22
|
+
0.9
|
|
21
23
|
"""
|
|
22
24
|
|
|
23
25
|
def wrapped_func(x: Number, y: Number) -> float:
|
|
@@ -40,6 +42,7 @@ def threshold(threshold: float) -> SimPairFunc[Number, float]:
|
|
|
40
42
|
threshold: If the absolute difference between the two values is less than or equal to this value, the similarity is 1.0, otherwise it is 0.0
|
|
41
43
|
|
|
42
44
|

|
|
45
|
+
|
|
43
46
|
Examples:
|
|
44
47
|
>>> sim = threshold(10)
|
|
45
48
|
>>> sim(50, 60)
|
|
@@ -61,6 +64,7 @@ def exponential(alpha: float = 1.0) -> SimPairFunc[Number, float]:
|
|
|
61
64
|
alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the similarity decreases.
|
|
62
65
|
|
|
63
66
|

|
|
67
|
+
|
|
64
68
|
Examples:
|
|
65
69
|
>>> sim = exponential(0.1)
|
|
66
70
|
>>> sim(50, 60)
|
|
@@ -81,6 +85,7 @@ def sigmoid(alpha: float = 1.0, theta: float = 1.0) -> SimPairFunc[Number, float
|
|
|
81
85
|
theta: Specifies the point at which the similarity value is 0.5.
|
|
82
86
|
|
|
83
87
|

|
|
88
|
+
|
|
84
89
|
Examples:
|
|
85
90
|
>>> sim = sigmoid(1, 10)
|
|
86
91
|
>>> sim(50, 60)
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Literal, Optional, Protocol, TypedDict, cast
|
|
3
|
+
|
|
4
|
+
from cbrkit.loaders import data as load_data
|
|
5
|
+
from cbrkit.typing import FilePath, SimPairFunc
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Taxonomy",
|
|
9
|
+
"TaxonomyNode",
|
|
10
|
+
"TaxonomyFunc",
|
|
11
|
+
"TaxonomyStrategy",
|
|
12
|
+
"load",
|
|
13
|
+
"wu_palmer",
|
|
14
|
+
"user_weights",
|
|
15
|
+
"auto_weights",
|
|
16
|
+
"node_levels",
|
|
17
|
+
"path_steps",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SerializedNode(TypedDict, total=False):
|
|
22
|
+
name: str
|
|
23
|
+
weight: float
|
|
24
|
+
children: list["SerializedNode | str"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(slots=True)
|
|
28
|
+
class TaxonomyNode:
|
|
29
|
+
name: str
|
|
30
|
+
weight: float
|
|
31
|
+
depth: int
|
|
32
|
+
parent: Optional["TaxonomyNode"]
|
|
33
|
+
children: dict[str, "TaxonomyNode"] = field(default_factory=dict)
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def level(self) -> int:
|
|
37
|
+
return self.depth + 1
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Taxonomy:
|
|
41
|
+
__slots__ = ("root", "nodes")
|
|
42
|
+
|
|
43
|
+
root: TaxonomyNode
|
|
44
|
+
nodes: dict[str, TaxonomyNode]
|
|
45
|
+
|
|
46
|
+
def __init__(self, path: FilePath) -> None:
|
|
47
|
+
root_data = cast(SerializedNode, load_data(path))
|
|
48
|
+
self.nodes = {}
|
|
49
|
+
self.root = self._load(root_data)
|
|
50
|
+
|
|
51
|
+
@property
|
|
52
|
+
def max_depth(self) -> int:
|
|
53
|
+
return max(node.depth for node in self.nodes.values())
|
|
54
|
+
|
|
55
|
+
@property
|
|
56
|
+
def max_level(self) -> int:
|
|
57
|
+
return max(node.level for node in self.nodes.values())
|
|
58
|
+
|
|
59
|
+
def _load(
|
|
60
|
+
self,
|
|
61
|
+
data: SerializedNode | str,
|
|
62
|
+
parent: TaxonomyNode | None = None,
|
|
63
|
+
depth: int = 0,
|
|
64
|
+
) -> TaxonomyNode:
|
|
65
|
+
if isinstance(data, str):
|
|
66
|
+
data = {"name": data}
|
|
67
|
+
|
|
68
|
+
assert "name" in data, "Missing name in some node"
|
|
69
|
+
|
|
70
|
+
node = TaxonomyNode(
|
|
71
|
+
name=data["name"],
|
|
72
|
+
weight=data.get("weight", 1.0),
|
|
73
|
+
depth=depth,
|
|
74
|
+
parent=parent,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
for child in data.get("children", []):
|
|
78
|
+
child_node = self._load(child, node, depth + 1)
|
|
79
|
+
node.children[child_node.name] = child_node
|
|
80
|
+
|
|
81
|
+
self.nodes[node.name] = node
|
|
82
|
+
|
|
83
|
+
return node
|
|
84
|
+
|
|
85
|
+
def lca(self, node1: TaxonomyNode, node2: TaxonomyNode) -> TaxonomyNode:
|
|
86
|
+
while node1 != node2:
|
|
87
|
+
if node1.parent is None or node2.parent is None:
|
|
88
|
+
return self.root
|
|
89
|
+
|
|
90
|
+
if node1.depth > node2.depth:
|
|
91
|
+
node1 = node1.parent
|
|
92
|
+
else:
|
|
93
|
+
node2 = node2.parent
|
|
94
|
+
|
|
95
|
+
return node1
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class TaxonomyFunc(Protocol):
|
|
99
|
+
def __call__(self, taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
100
|
+
...
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
TaxonomyStrategy = Literal["optimistic", "pessimistic", "average"]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def wu_palmer() -> TaxonomyFunc:
|
|
107
|
+
"""Wu & Palmer similarity measure of two nodes in a taxonomy.
|
|
108
|
+
|
|
109
|
+
Examples:
|
|
110
|
+
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
111
|
+
>>> sim = wu_palmer()
|
|
112
|
+
>>> sim(taxonomy, "audi", "porsche")
|
|
113
|
+
0.5
|
|
114
|
+
>>> sim(taxonomy, "audi", "bmw")
|
|
115
|
+
0.0
|
|
116
|
+
"""
|
|
117
|
+
|
|
118
|
+
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
119
|
+
node1 = taxonomy.nodes[x]
|
|
120
|
+
node2 = taxonomy.nodes[y]
|
|
121
|
+
lca = taxonomy.lca(node1, node2)
|
|
122
|
+
|
|
123
|
+
return (2 * lca.depth) / (node1.depth + node2.depth)
|
|
124
|
+
|
|
125
|
+
return wrapped_func
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def user_weights(strategy: TaxonomyStrategy) -> TaxonomyFunc:
|
|
129
|
+
"""User-defined weights similarity measure of two nodes in a taxonomy.
|
|
130
|
+
|
|
131
|
+
The weights are defined by the user in the taxonomy file.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
strategy: The strategy to use in case one of the nodes is the lowest common ancestor (lca).
|
|
135
|
+
One of "optimistic", "pessimistic", or "average".
|
|
136
|
+
|
|
137
|
+

|
|
138
|
+
|
|
139
|
+
Examples:
|
|
140
|
+
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
141
|
+
>>> sim = user_weights("optimistic")
|
|
142
|
+
>>> sim(taxonomy, "audi", "Volkswagen AG")
|
|
143
|
+
1.0
|
|
144
|
+
>>> sim(taxonomy, "audi", "bmw")
|
|
145
|
+
0.0
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
149
|
+
node1 = taxonomy.nodes[x]
|
|
150
|
+
node2 = taxonomy.nodes[y]
|
|
151
|
+
lca = taxonomy.lca(node1, node2)
|
|
152
|
+
weight = lca.weight
|
|
153
|
+
|
|
154
|
+
if lca == node1 or lca == node2:
|
|
155
|
+
# pessimistic not needed: weight of lca already used
|
|
156
|
+
if strategy == "optimistic":
|
|
157
|
+
weight = 1.0
|
|
158
|
+
elif strategy == "average":
|
|
159
|
+
weight = (node1.weight + node2.weight) / 2
|
|
160
|
+
|
|
161
|
+
return weight
|
|
162
|
+
|
|
163
|
+
return wrapped_func
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def auto_weights(strategy: TaxonomyStrategy) -> TaxonomyFunc:
|
|
167
|
+
"""Automatic weights similarity measure of two nodes in a taxonomy.
|
|
168
|
+
|
|
169
|
+
The weights are automatically calculated based on the depth of the nodes.
|
|
170
|
+
|
|
171
|
+
Args:
|
|
172
|
+
strategy: The strategy to use in case one of the nodes is the lowest common ancestor (lca).
|
|
173
|
+
One of "optimistic", "pessimistic", or "average".
|
|
174
|
+
|
|
175
|
+

|
|
176
|
+
|
|
177
|
+
Examples:
|
|
178
|
+
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
179
|
+
>>> sim = auto_weights("optimistic")
|
|
180
|
+
>>> sim(taxonomy, "audi", "Volkswagen AG")
|
|
181
|
+
1.0
|
|
182
|
+
>>> sim(taxonomy, "audi", "bmw")
|
|
183
|
+
0.0
|
|
184
|
+
"""
|
|
185
|
+
|
|
186
|
+
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
187
|
+
node1 = taxonomy.nodes[x]
|
|
188
|
+
node2 = taxonomy.nodes[y]
|
|
189
|
+
lca = taxonomy.lca(node1, node2)
|
|
190
|
+
max_depth = taxonomy.max_depth
|
|
191
|
+
|
|
192
|
+
weight = lca.depth / max_depth
|
|
193
|
+
|
|
194
|
+
if lca == node1 or lca == node2:
|
|
195
|
+
# pessimistic not needed: weight of lca already used
|
|
196
|
+
if strategy == "optimistic":
|
|
197
|
+
weight = 1.0
|
|
198
|
+
elif strategy == "average":
|
|
199
|
+
weight1 = node1.depth / max_depth
|
|
200
|
+
weight2 = node2.depth / max_depth
|
|
201
|
+
weight = (weight1 + weight2) / 2
|
|
202
|
+
|
|
203
|
+
return weight
|
|
204
|
+
|
|
205
|
+
return wrapped_func
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def node_levels(strategy: TaxonomyStrategy) -> TaxonomyFunc:
|
|
209
|
+
"""Node levels similarity measure of two nodes in a taxonomy.
|
|
210
|
+
|
|
211
|
+
The similarity is calculated based on the levels of the nodes.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
strategy: The strategy to use in case one of the nodes is the lowest common ancestor (lca).
|
|
215
|
+
One of "optimistic", "pessimistic", or "average".
|
|
216
|
+
|
|
217
|
+

|
|
218
|
+
|
|
219
|
+
Examples:
|
|
220
|
+
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
221
|
+
>>> sim = node_levels("optimistic")
|
|
222
|
+
>>> sim(taxonomy, "audi", "Volkswagen AG")
|
|
223
|
+
1.0
|
|
224
|
+
>>> sim(taxonomy, "audi", "bmw")
|
|
225
|
+
0.3333333333333333
|
|
226
|
+
"""
|
|
227
|
+
|
|
228
|
+
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
229
|
+
node1 = taxonomy.nodes[x]
|
|
230
|
+
node2 = taxonomy.nodes[y]
|
|
231
|
+
lca = taxonomy.lca(node1, node2)
|
|
232
|
+
|
|
233
|
+
if strategy == "optimistic":
|
|
234
|
+
return lca.level / min(node1.level, node2.level)
|
|
235
|
+
elif strategy == "pessimistic":
|
|
236
|
+
return lca.level / max(node1.level, node2.level)
|
|
237
|
+
elif strategy == "average":
|
|
238
|
+
return lca.level / ((node1.level + node2.level) / 2)
|
|
239
|
+
else:
|
|
240
|
+
return 0.0
|
|
241
|
+
|
|
242
|
+
return wrapped_func
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def path_steps(weightUp: float = 1.0, weightDown: float = 1.0) -> TaxonomyFunc:
|
|
246
|
+
"""Path steps similarity measure of two nodes in a taxonomy.
|
|
247
|
+
|
|
248
|
+
The similarity is calculated based on the steps up and down from the lowest common ancestor (lca).
|
|
249
|
+
|
|
250
|
+
Args:
|
|
251
|
+
weightUp: The weight to use for the steps up.
|
|
252
|
+
weightDown: The weight to use for the steps down.
|
|
253
|
+
|
|
254
|
+

|
|
255
|
+
|
|
256
|
+
Examples:
|
|
257
|
+
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
258
|
+
>>> sim = path_steps()
|
|
259
|
+
>>> sim(taxonomy, "audi", "Volkswagen AG")
|
|
260
|
+
0.8333333333333334
|
|
261
|
+
>>> sim(taxonomy, "audi", "bmw")
|
|
262
|
+
0.3333333333333333
|
|
263
|
+
"""
|
|
264
|
+
|
|
265
|
+
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
266
|
+
node1 = taxonomy.nodes[x]
|
|
267
|
+
node2 = taxonomy.nodes[y]
|
|
268
|
+
lca = taxonomy.lca(node1, node2)
|
|
269
|
+
|
|
270
|
+
stepsUp = node1.depth - lca.depth
|
|
271
|
+
stepsDown = node2.depth - lca.depth
|
|
272
|
+
|
|
273
|
+
weightedSteps = (stepsUp * weightUp) + (stepsDown * weightDown)
|
|
274
|
+
maxWeightedSteps = (taxonomy.max_depth * weightUp) + (
|
|
275
|
+
taxonomy.max_depth * weightDown
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
return (maxWeightedSteps - weightedSteps) / maxWeightedSteps
|
|
279
|
+
|
|
280
|
+
return wrapped_func
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
_taxonomy_func = wu_palmer()
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def load(
|
|
287
|
+
path: FilePath, measure: TaxonomyFunc = _taxonomy_func
|
|
288
|
+
) -> SimPairFunc[str, float]:
|
|
289
|
+
"""Load a taxonomy and return a function that measures the similarity.
|
|
290
|
+
|
|
291
|
+
The taxonomy is loaded from the given path and expected to conform to the following structure:
|
|
292
|
+
|
|
293
|
+
```yaml
|
|
294
|
+
name: ROOT
|
|
295
|
+
weight: 0.0
|
|
296
|
+
children:
|
|
297
|
+
- name: CHILD1
|
|
298
|
+
weight: 0.4
|
|
299
|
+
children:
|
|
300
|
+
- name: GRANDCHILD1
|
|
301
|
+
weight: 0.8
|
|
302
|
+
children:
|
|
303
|
+
- name: GREATGRANDCHILD1
|
|
304
|
+
- name: GREATGRANDCHILD2
|
|
305
|
+
- name: GRANDCHILD2
|
|
306
|
+
- name: CHILD2
|
|
307
|
+
weight: 0.6
|
|
308
|
+
children: []
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
The `name` field is required for each node, and the `children` field is optional.
|
|
312
|
+
The `weight` field is optional and can be used to assign a weight/similarity value to the node.
|
|
313
|
+
If not set, the default value is 1.0.
|
|
314
|
+
The `weight` is only used by the measure `user_weights` and ignored otherwise.
|
|
315
|
+
|
|
316
|
+
Examples:
|
|
317
|
+
>>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
|
|
318
|
+
>>> sim("audi", "porsche")
|
|
319
|
+
0.5
|
|
320
|
+
>>> sim("audi", "bmw")
|
|
321
|
+
0.0
|
|
322
|
+
"""
|
|
323
|
+
taxonomy = Taxonomy(path)
|
|
324
|
+
|
|
325
|
+
def wrapped_func(x: str, y: str) -> float:
|
|
326
|
+
return measure(taxonomy, x, y)
|
|
327
|
+
|
|
328
|
+
return wrapped_func
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from . import collections, generic, numeric, strings, taxonomy
|
|
2
|
-
from ._helpers import AbstractFloat, dist2sim, sim2map, sim2seq, unpack_sim, unpack_sims
|
|
3
|
-
|
|
4
|
-
__all__ = [
|
|
5
|
-
"collections",
|
|
6
|
-
"generic",
|
|
7
|
-
"numeric",
|
|
8
|
-
"strings",
|
|
9
|
-
"taxonomy",
|
|
10
|
-
"dist2sim",
|
|
11
|
-
"sim2map",
|
|
12
|
-
"sim2seq",
|
|
13
|
-
"unpack_sim",
|
|
14
|
-
"unpack_sims",
|
|
15
|
-
"AbstractFloat",
|
|
16
|
-
]
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass, field
|
|
2
|
-
from typing import Optional, Protocol, TypedDict, cast
|
|
3
|
-
|
|
4
|
-
from cbrkit.loaders import data as load_data
|
|
5
|
-
from cbrkit.typing import FilePath, SimPairFunc
|
|
6
|
-
|
|
7
|
-
__all__ = ["Taxonomy", "TaxonomyNode", "TaxonomyFunc", "load", "wu_palmer"]
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class SerializedNode(TypedDict, total=False):
|
|
11
|
-
key: str
|
|
12
|
-
sim: float
|
|
13
|
-
children: list["SerializedNode | str"]
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@dataclass(slots=True)
|
|
17
|
-
class TaxonomyNode:
|
|
18
|
-
key: str
|
|
19
|
-
weight: float | None
|
|
20
|
-
depth: int
|
|
21
|
-
parent: Optional["TaxonomyNode"]
|
|
22
|
-
children: dict[str, "TaxonomyNode"] = field(default_factory=dict)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class Taxonomy:
|
|
26
|
-
__slots__ = ("root", "nodes")
|
|
27
|
-
|
|
28
|
-
root: TaxonomyNode
|
|
29
|
-
nodes: dict[str, TaxonomyNode]
|
|
30
|
-
|
|
31
|
-
def __init__(self, path: FilePath) -> None:
|
|
32
|
-
root_data = cast(SerializedNode, load_data(path))
|
|
33
|
-
self.nodes = {}
|
|
34
|
-
self.root = self._load(root_data)
|
|
35
|
-
|
|
36
|
-
def _load(
|
|
37
|
-
self,
|
|
38
|
-
data: SerializedNode | str,
|
|
39
|
-
parent: TaxonomyNode | None = None,
|
|
40
|
-
depth: int = 0,
|
|
41
|
-
) -> TaxonomyNode:
|
|
42
|
-
if isinstance(data, str):
|
|
43
|
-
data = {"key": data}
|
|
44
|
-
|
|
45
|
-
assert "key" in data, "Missing key in some node"
|
|
46
|
-
|
|
47
|
-
node = TaxonomyNode(
|
|
48
|
-
key=data["key"],
|
|
49
|
-
weight=data.get("weight"),
|
|
50
|
-
depth=depth,
|
|
51
|
-
parent=parent,
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
for child in data.get("children", []):
|
|
55
|
-
child_node = self._load(child, node, depth + 1)
|
|
56
|
-
node.children[child_node.key] = child_node
|
|
57
|
-
|
|
58
|
-
self.nodes[node.key] = node
|
|
59
|
-
|
|
60
|
-
return node
|
|
61
|
-
|
|
62
|
-
def lca(self, node1: TaxonomyNode, node2: TaxonomyNode) -> TaxonomyNode:
|
|
63
|
-
while node1 != node2:
|
|
64
|
-
if node1.parent is None or node2.parent is None:
|
|
65
|
-
return self.root
|
|
66
|
-
|
|
67
|
-
if node1.depth > node2.depth:
|
|
68
|
-
node1 = node1.parent
|
|
69
|
-
else:
|
|
70
|
-
node2 = node2.parent
|
|
71
|
-
|
|
72
|
-
return node1
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
class TaxonomyFunc(Protocol):
|
|
76
|
-
def __call__(self, taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
77
|
-
...
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def wu_palmer() -> TaxonomyFunc:
|
|
81
|
-
"""Wu & Palmer similarity measure of two nodes in a taxonomy.
|
|
82
|
-
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
83
|
-
>>> sim = wu_palmer()
|
|
84
|
-
>>> sim(taxonomy, "audi", "porsche")
|
|
85
|
-
0.5
|
|
86
|
-
>>> sim(taxonomy, "audi", "bmw")
|
|
87
|
-
0.0
|
|
88
|
-
"""
|
|
89
|
-
|
|
90
|
-
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
91
|
-
node1 = taxonomy.nodes[x]
|
|
92
|
-
node2 = taxonomy.nodes[y]
|
|
93
|
-
lca = taxonomy.lca(node1, node2)
|
|
94
|
-
|
|
95
|
-
return (2 * lca.depth) / (node1.depth + node2.depth)
|
|
96
|
-
|
|
97
|
-
return wrapped_func
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
_taxonomy_func = wu_palmer()
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def load(
|
|
104
|
-
path: FilePath, measure: TaxonomyFunc = _taxonomy_func
|
|
105
|
-
) -> SimPairFunc[str, float]:
|
|
106
|
-
"""Load a taxonomy and return a function that measures the similarity.
|
|
107
|
-
>>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
|
|
108
|
-
>>> sim("audi", "porsche")
|
|
109
|
-
0.5
|
|
110
|
-
>>> sim("audi", "bmw")
|
|
111
|
-
0.0
|
|
112
|
-
"""
|
|
113
|
-
taxonomy = Taxonomy(path)
|
|
114
|
-
|
|
115
|
-
def wrapped_func(x: str, y: str) -> float:
|
|
116
|
-
return measure(taxonomy, x, y)
|
|
117
|
-
|
|
118
|
-
return wrapped_func
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|