cbrkit 0.3.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {cbrkit-0.3.2 → cbrkit-0.4.0}/PKG-INFO +3 -3
  2. {cbrkit-0.3.2 → cbrkit-0.4.0}/README.md +2 -2
  3. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/sim/numeric.py +8 -3
  4. cbrkit-0.4.0/cbrkit/sim/taxonomy.py +328 -0
  5. {cbrkit-0.3.2 → cbrkit-0.4.0}/pyproject.toml +1 -1
  6. cbrkit-0.3.2/cbrkit/sim/taxonomy.py +0 -118
  7. {cbrkit-0.3.2 → cbrkit-0.4.0}/LICENSE +0 -0
  8. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/__init__.py +0 -0
  9. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/__main__.py +0 -0
  10. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/api.py +0 -0
  11. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/cli.py +0 -0
  12. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/global_sim/__init__.py +0 -0
  13. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/global_sim/_aggregate.py +0 -0
  14. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/global_sim/_attribute_value.py +0 -0
  15. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/global_sim/graph/__init__.py +0 -0
  16. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/global_sim/graph/_astar.py +0 -0
  17. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/global_sim/graph/_model.py +0 -0
  18. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/helpers.py +0 -0
  19. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/loaders.py +0 -0
  20. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/py.typed +0 -0
  21. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/retrieval.py +0 -0
  22. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/sim/__init__.py +0 -0
  23. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/sim/collections.py +0 -0
  24. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/sim/generic.py +0 -0
  25. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/sim/strings.py +0 -0
  26. {cbrkit-0.3.2 → cbrkit-0.4.0}/cbrkit/typing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cbrkit
3
- Version: 0.3.2
3
+ Version: 0.4.0
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
5
5
  Home-page: https://wi2trier.github.io/cbrkit/
6
6
  License: MIT
@@ -135,10 +135,10 @@ queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
135
135
  queries = cbrkit.loaders.json("path/to/queries.json")
136
136
  ```
137
137
 
138
- In case your query collection only contains a single query, you can use the `singleton` function to extract it.
138
+ In case your query collection only contains a single entry, you can use the `singleton` function to extract it.
139
139
 
140
140
  ```python
141
- query = cbrkit.singleton(queries)
141
+ query = cbrkit.helpers.singleton(queries)
142
142
  ```
143
143
 
144
144
  Alternatively, you can also create a query directly in Python:
@@ -86,10 +86,10 @@ queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
86
86
  queries = cbrkit.loaders.json("path/to/queries.json")
87
87
  ```
88
88
 
89
- In case your query collection only contains a single query, you can use the `singleton` function to extract it.
89
+ In case your query collection only contains a single entry, you can use the `singleton` function to extract it.
90
90
 
91
91
  ```python
92
- query = cbrkit.singleton(queries)
92
+ query = cbrkit.helpers.singleton(queries)
93
93
  ```
94
94
 
95
95
  Alternatively, you can also create a query directly in Python:
@@ -15,9 +15,11 @@ def linear(max: float, min: float = 0.0) -> SimPairFunc[Number, float]:
15
15
  min: Minimum bound of the interval
16
16
 
17
17
  ![linear](../../assets/numeric/linear.png)
18
- >>> sim = linear(100)
19
- >>> sim(50, 60)
20
- 0.9
18
+
19
+ Examples:
20
+ >>> sim = linear(100)
21
+ >>> sim(50, 60)
22
+ 0.9
21
23
  """
22
24
 
23
25
  def wrapped_func(x: Number, y: Number) -> float:
@@ -40,6 +42,7 @@ def threshold(threshold: float) -> SimPairFunc[Number, float]:
40
42
  threshold: If the absolute difference between the two values is less than or equal to this value, the similarity is 1.0, otherwise it is 0.0
41
43
 
42
44
  ![threshold](../../assets/numeric/threshold.png)
45
+
43
46
  Examples:
44
47
  >>> sim = threshold(10)
45
48
  >>> sim(50, 60)
@@ -61,6 +64,7 @@ def exponential(alpha: float = 1.0) -> SimPairFunc[Number, float]:
61
64
  alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the similarity decreases.
62
65
 
63
66
  ![exponential](../../assets/numeric/exponential.png)
67
+
64
68
  Examples:
65
69
  >>> sim = exponential(0.1)
66
70
  >>> sim(50, 60)
@@ -81,6 +85,7 @@ def sigmoid(alpha: float = 1.0, theta: float = 1.0) -> SimPairFunc[Number, float
81
85
  theta: Specifies the point at which the similarity value is 0.5.
82
86
 
83
87
  ![sigmoid](../../assets/numeric/sigmoid.png)
88
+
84
89
  Examples:
85
90
  >>> sim = sigmoid(1, 10)
86
91
  >>> sim(50, 60)
@@ -0,0 +1,328 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Literal, Optional, Protocol, TypedDict, cast
3
+
4
+ from cbrkit.loaders import data as load_data
5
+ from cbrkit.typing import FilePath, SimPairFunc
6
+
7
+ __all__ = [
8
+ "Taxonomy",
9
+ "TaxonomyNode",
10
+ "TaxonomyFunc",
11
+ "TaxonomyStrategy",
12
+ "load",
13
+ "wu_palmer",
14
+ "user_weights",
15
+ "auto_weights",
16
+ "node_levels",
17
+ "path_steps",
18
+ ]
19
+
20
+
21
+ class SerializedNode(TypedDict, total=False):
22
+ name: str
23
+ weight: float
24
+ children: list["SerializedNode | str"]
25
+
26
+
27
+ @dataclass(slots=True)
28
+ class TaxonomyNode:
29
+ name: str
30
+ weight: float
31
+ depth: int
32
+ parent: Optional["TaxonomyNode"]
33
+ children: dict[str, "TaxonomyNode"] = field(default_factory=dict)
34
+
35
+ @property
36
+ def level(self) -> int:
37
+ return self.depth + 1
38
+
39
+
40
+ class Taxonomy:
41
+ __slots__ = ("root", "nodes")
42
+
43
+ root: TaxonomyNode
44
+ nodes: dict[str, TaxonomyNode]
45
+
46
+ def __init__(self, path: FilePath) -> None:
47
+ root_data = cast(SerializedNode, load_data(path))
48
+ self.nodes = {}
49
+ self.root = self._load(root_data)
50
+
51
+ @property
52
+ def max_depth(self) -> int:
53
+ return max(node.depth for node in self.nodes.values())
54
+
55
+ @property
56
+ def max_level(self) -> int:
57
+ return max(node.level for node in self.nodes.values())
58
+
59
+ def _load(
60
+ self,
61
+ data: SerializedNode | str,
62
+ parent: TaxonomyNode | None = None,
63
+ depth: int = 0,
64
+ ) -> TaxonomyNode:
65
+ if isinstance(data, str):
66
+ data = {"name": data}
67
+
68
+ assert "name" in data, "Missing name in some node"
69
+
70
+ node = TaxonomyNode(
71
+ name=data["name"],
72
+ weight=data.get("weight", 1.0),
73
+ depth=depth,
74
+ parent=parent,
75
+ )
76
+
77
+ for child in data.get("children", []):
78
+ child_node = self._load(child, node, depth + 1)
79
+ node.children[child_node.name] = child_node
80
+
81
+ self.nodes[node.name] = node
82
+
83
+ return node
84
+
85
+ def lca(self, node1: TaxonomyNode, node2: TaxonomyNode) -> TaxonomyNode:
86
+ while node1 != node2:
87
+ if node1.parent is None or node2.parent is None:
88
+ return self.root
89
+
90
+ if node1.depth > node2.depth:
91
+ node1 = node1.parent
92
+ else:
93
+ node2 = node2.parent
94
+
95
+ return node1
96
+
97
+
98
+ class TaxonomyFunc(Protocol):
99
+ def __call__(self, taxonomy: Taxonomy, x: str, y: str) -> float:
100
+ ...
101
+
102
+
103
+ TaxonomyStrategy = Literal["optimistic", "pessimistic", "average"]
104
+
105
+
106
+ def wu_palmer() -> TaxonomyFunc:
107
+ """Wu & Palmer similarity measure of two nodes in a taxonomy.
108
+
109
+ Examples:
110
+ >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
111
+ >>> sim = wu_palmer()
112
+ >>> sim(taxonomy, "audi", "porsche")
113
+ 0.5
114
+ >>> sim(taxonomy, "audi", "bmw")
115
+ 0.0
116
+ """
117
+
118
+ def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
119
+ node1 = taxonomy.nodes[x]
120
+ node2 = taxonomy.nodes[y]
121
+ lca = taxonomy.lca(node1, node2)
122
+
123
+ return (2 * lca.depth) / (node1.depth + node2.depth)
124
+
125
+ return wrapped_func
126
+
127
+
128
+ def user_weights(strategy: TaxonomyStrategy) -> TaxonomyFunc:
129
+ """User-defined weights similarity measure of two nodes in a taxonomy.
130
+
131
+ The weights are defined by the user in the taxonomy file.
132
+
133
+ Args:
134
+ strategy: The strategy to use in case one of the nodes is the lowest common ancestor (lca).
135
+ One of "optimistic", "pessimistic", or "average".
136
+
137
+ ![user weights](../../assets/taxonomy/user-weights.png)
138
+
139
+ Examples:
140
+ >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
141
+ >>> sim = user_weights("optimistic")
142
+ >>> sim(taxonomy, "audi", "Volkswagen AG")
143
+ 1.0
144
+ >>> sim(taxonomy, "audi", "bmw")
145
+ 0.0
146
+ """
147
+
148
+ def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
149
+ node1 = taxonomy.nodes[x]
150
+ node2 = taxonomy.nodes[y]
151
+ lca = taxonomy.lca(node1, node2)
152
+ weight = lca.weight
153
+
154
+ if lca == node1 or lca == node2:
155
+ # pessimistic not needed: weight of lca already used
156
+ if strategy == "optimistic":
157
+ weight = 1.0
158
+ elif strategy == "average":
159
+ weight = (node1.weight + node2.weight) / 2
160
+
161
+ return weight
162
+
163
+ return wrapped_func
164
+
165
+
166
+ def auto_weights(strategy: TaxonomyStrategy) -> TaxonomyFunc:
167
+ """Automatic weights similarity measure of two nodes in a taxonomy.
168
+
169
+ The weights are automatically calculated based on the depth of the nodes.
170
+
171
+ Args:
172
+ strategy: The strategy to use in case one of the nodes is the lowest common ancestor (lca).
173
+ One of "optimistic", "pessimistic", or "average".
174
+
175
+ ![auto weights](../../assets/taxonomy/auto-weights.png)
176
+
177
+ Examples:
178
+ >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
179
+ >>> sim = auto_weights("optimistic")
180
+ >>> sim(taxonomy, "audi", "Volkswagen AG")
181
+ 1.0
182
+ >>> sim(taxonomy, "audi", "bmw")
183
+ 0.0
184
+ """
185
+
186
+ def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
187
+ node1 = taxonomy.nodes[x]
188
+ node2 = taxonomy.nodes[y]
189
+ lca = taxonomy.lca(node1, node2)
190
+ max_depth = taxonomy.max_depth
191
+
192
+ weight = lca.depth / max_depth
193
+
194
+ if lca == node1 or lca == node2:
195
+ # pessimistic not needed: weight of lca already used
196
+ if strategy == "optimistic":
197
+ weight = 1.0
198
+ elif strategy == "average":
199
+ weight1 = node1.depth / max_depth
200
+ weight2 = node2.depth / max_depth
201
+ weight = (weight1 + weight2) / 2
202
+
203
+ return weight
204
+
205
+ return wrapped_func
206
+
207
+
208
+ def node_levels(strategy: TaxonomyStrategy) -> TaxonomyFunc:
209
+ """Node levels similarity measure of two nodes in a taxonomy.
210
+
211
+ The similarity is calculated based on the levels of the nodes.
212
+
213
+ Args:
214
+ strategy: The strategy to use in case one of the nodes is the lowest common ancestor (lca).
215
+ One of "optimistic", "pessimistic", or "average".
216
+
217
+ ![node levels](../../assets/taxonomy/node-levels.png)
218
+
219
+ Examples:
220
+ >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
221
+ >>> sim = node_levels("optimistic")
222
+ >>> sim(taxonomy, "audi", "Volkswagen AG")
223
+ 1.0
224
+ >>> sim(taxonomy, "audi", "bmw")
225
+ 0.3333333333333333
226
+ """
227
+
228
+ def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
229
+ node1 = taxonomy.nodes[x]
230
+ node2 = taxonomy.nodes[y]
231
+ lca = taxonomy.lca(node1, node2)
232
+
233
+ if strategy == "optimistic":
234
+ return lca.level / min(node1.level, node2.level)
235
+ elif strategy == "pessimistic":
236
+ return lca.level / max(node1.level, node2.level)
237
+ elif strategy == "average":
238
+ return lca.level / ((node1.level + node2.level) / 2)
239
+ else:
240
+ return 0.0
241
+
242
+ return wrapped_func
243
+
244
+
245
+ def path_steps(weightUp: float = 1.0, weightDown: float = 1.0) -> TaxonomyFunc:
246
+ """Path steps similarity measure of two nodes in a taxonomy.
247
+
248
+ The similarity is calculated based on the steps up and down from the lowest common ancestor (lca).
249
+
250
+ Args:
251
+ weightUp: The weight to use for the steps up.
252
+ weightDown: The weight to use for the steps down.
253
+
254
+ ![path steps](../../assets/taxonomy/path-steps.png)
255
+
256
+ Examples:
257
+ >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
258
+ >>> sim = path_steps()
259
+ >>> sim(taxonomy, "audi", "Volkswagen AG")
260
+ 0.8333333333333334
261
+ >>> sim(taxonomy, "audi", "bmw")
262
+ 0.3333333333333333
263
+ """
264
+
265
+ def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
266
+ node1 = taxonomy.nodes[x]
267
+ node2 = taxonomy.nodes[y]
268
+ lca = taxonomy.lca(node1, node2)
269
+
270
+ stepsUp = node1.depth - lca.depth
271
+ stepsDown = node2.depth - lca.depth
272
+
273
+ weightedSteps = (stepsUp * weightUp) + (stepsDown * weightDown)
274
+ maxWeightedSteps = (taxonomy.max_depth * weightUp) + (
275
+ taxonomy.max_depth * weightDown
276
+ )
277
+
278
+ return (maxWeightedSteps - weightedSteps) / maxWeightedSteps
279
+
280
+ return wrapped_func
281
+
282
+
283
+ _taxonomy_func = wu_palmer()
284
+
285
+
286
+ def load(
287
+ path: FilePath, measure: TaxonomyFunc = _taxonomy_func
288
+ ) -> SimPairFunc[str, float]:
289
+ """Load a taxonomy and return a function that measures the similarity.
290
+
291
+ The taxonomy is loaded from the given path and expected to conform to the following structure:
292
+
293
+ ```yaml
294
+ name: ROOT
295
+ weight: 0.0
296
+ children:
297
+ - name: CHILD1
298
+ weight: 0.4
299
+ children:
300
+ - name: GRANDCHILD1
301
+ weight: 0.8
302
+ children:
303
+ - name: GREATGRANDCHILD1
304
+ - name: GREATGRANDCHILD2
305
+ - name: GRANDCHILD2
306
+ - name: CHILD2
307
+ weight: 0.6
308
+ children: []
309
+ ```
310
+
311
+ The `name` field is required for each node, and the `children` field is optional.
312
+ The `weight` field is optional and can be used to assign a weight/similarity value to the node.
313
+ If not set, the default value is 1.0.
314
+ The `weight` is only used by the measure `user_weights` and ignored otherwise.
315
+
316
+ Examples:
317
+ >>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
318
+ >>> sim("audi", "porsche")
319
+ 0.5
320
+ >>> sim("audi", "bmw")
321
+ 0.0
322
+ """
323
+ taxonomy = Taxonomy(path)
324
+
325
+ def wrapped_func(x: str, y: str) -> float:
326
+ return measure(taxonomy, x, y)
327
+
328
+ return wrapped_func
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cbrkit"
3
- version = "0.3.2"
3
+ version = "0.4.0"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
5
5
  authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
6
6
  license = "MIT"
@@ -1,118 +0,0 @@
1
- from dataclasses import dataclass, field
2
- from typing import Optional, Protocol, TypedDict, cast
3
-
4
- from cbrkit.loaders import data as load_data
5
- from cbrkit.typing import FilePath, SimPairFunc
6
-
7
- __all__ = ["Taxonomy", "TaxonomyNode", "TaxonomyFunc", "load", "wu_palmer"]
8
-
9
-
10
- class SerializedNode(TypedDict, total=False):
11
- key: str
12
- sim: float
13
- children: list["SerializedNode | str"]
14
-
15
-
16
- @dataclass(slots=True)
17
- class TaxonomyNode:
18
- key: str
19
- weight: float | None
20
- depth: int
21
- parent: Optional["TaxonomyNode"]
22
- children: dict[str, "TaxonomyNode"] = field(default_factory=dict)
23
-
24
-
25
- class Taxonomy:
26
- __slots__ = ("root", "nodes")
27
-
28
- root: TaxonomyNode
29
- nodes: dict[str, TaxonomyNode]
30
-
31
- def __init__(self, path: FilePath) -> None:
32
- root_data = cast(SerializedNode, load_data(path))
33
- self.nodes = {}
34
- self.root = self._load(root_data)
35
-
36
- def _load(
37
- self,
38
- data: SerializedNode | str,
39
- parent: TaxonomyNode | None = None,
40
- depth: int = 0,
41
- ) -> TaxonomyNode:
42
- if isinstance(data, str):
43
- data = {"key": data}
44
-
45
- assert "key" in data, "Missing key in some node"
46
-
47
- node = TaxonomyNode(
48
- key=data["key"],
49
- weight=data.get("weight"),
50
- depth=depth,
51
- parent=parent,
52
- )
53
-
54
- for child in data.get("children", []):
55
- child_node = self._load(child, node, depth + 1)
56
- node.children[child_node.key] = child_node
57
-
58
- self.nodes[node.key] = node
59
-
60
- return node
61
-
62
- def lca(self, node1: TaxonomyNode, node2: TaxonomyNode) -> TaxonomyNode:
63
- while node1 != node2:
64
- if node1.parent is None or node2.parent is None:
65
- return self.root
66
-
67
- if node1.depth > node2.depth:
68
- node1 = node1.parent
69
- else:
70
- node2 = node2.parent
71
-
72
- return node1
73
-
74
-
75
- class TaxonomyFunc(Protocol):
76
- def __call__(self, taxonomy: Taxonomy, x: str, y: str) -> float:
77
- ...
78
-
79
-
80
- def wu_palmer() -> TaxonomyFunc:
81
- """Wu & Palmer similarity measure of two nodes in a taxonomy.
82
- >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
83
- >>> sim = wu_palmer()
84
- >>> sim(taxonomy, "audi", "porsche")
85
- 0.5
86
- >>> sim(taxonomy, "audi", "bmw")
87
- 0.0
88
- """
89
-
90
- def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
91
- node1 = taxonomy.nodes[x]
92
- node2 = taxonomy.nodes[y]
93
- lca = taxonomy.lca(node1, node2)
94
-
95
- return (2 * lca.depth) / (node1.depth + node2.depth)
96
-
97
- return wrapped_func
98
-
99
-
100
- _taxonomy_func = wu_palmer()
101
-
102
-
103
- def load(
104
- path: FilePath, measure: TaxonomyFunc = _taxonomy_func
105
- ) -> SimPairFunc[str, float]:
106
- """Load a taxonomy and return a function that measures the similarity.
107
- >>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
108
- >>> sim("audi", "porsche")
109
- 0.5
110
- >>> sim("audi", "bmw")
111
- 0.0
112
- """
113
- taxonomy = Taxonomy(path)
114
-
115
- def wrapped_func(x: str, y: str) -> float:
116
- return measure(taxonomy, x, y)
117
-
118
- return wrapped_func
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes