cbrkit 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cbrkit-0.3.0 → cbrkit-0.3.2}/PKG-INFO +66 -18
- cbrkit-0.3.2/README.md +104 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/__init__.py +5 -3
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/global_sim/_aggregate.py +15 -1
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/global_sim/_attribute_value.py +14 -1
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/global_sim/graph/_astar.py +13 -1
- cbrkit-0.3.0/cbrkit/sim/_helpers.py → cbrkit-0.3.2/cbrkit/helpers.py +56 -1
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/loaders.py +136 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/retrieval.py +66 -1
- cbrkit-0.3.2/cbrkit/sim/__init__.py +9 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/sim/collections.py +10 -1
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/sim/generic.py +27 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/sim/numeric.py +20 -1
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/sim/strings.py +83 -1
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/sim/taxonomy.py +18 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/pyproject.toml +22 -28
- cbrkit-0.3.0/README.md +0 -57
- cbrkit-0.3.0/cbrkit/sim/__init__.py +0 -16
- {cbrkit-0.3.0 → cbrkit-0.3.2}/LICENSE +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/__main__.py +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/api.py +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/cli.py +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/global_sim/__init__.py +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/global_sim/graph/__init__.py +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/global_sim/graph/_model.py +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/py.typed +0 -0
- {cbrkit-0.3.0 → cbrkit-0.3.2}/cbrkit/typing.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cbrkit
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
|
|
5
5
|
Home-page: https://wi2trier.github.io/cbrkit/
|
|
6
6
|
License: MIT
|
|
@@ -28,20 +28,21 @@ Provides-Extra: all
|
|
|
28
28
|
Provides-Extra: api
|
|
29
29
|
Provides-Extra: cli
|
|
30
30
|
Provides-Extra: nlp
|
|
31
|
-
|
|
32
|
-
Requires-Dist:
|
|
31
|
+
Provides-Extra: transformers
|
|
32
|
+
Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
|
|
33
|
+
Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
|
|
33
34
|
Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
|
|
34
|
-
Requires-Dist: openai (>=1.
|
|
35
|
+
Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
|
|
35
36
|
Requires-Dist: orjson (>=3.9,<4.0)
|
|
36
37
|
Requires-Dist: pandas (>=2.1,<3.0)
|
|
37
|
-
Requires-Dist: pyarrow (>=
|
|
38
|
+
Requires-Dist: pyarrow (>=13.0)
|
|
38
39
|
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
39
|
-
Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "
|
|
40
|
-
Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
|
|
41
|
-
Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "
|
|
42
|
-
Requires-Dist: transformers (>=4.
|
|
40
|
+
Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
|
|
41
|
+
Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
|
|
42
|
+
Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
|
|
43
|
+
Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
|
|
43
44
|
Requires-Dist: typer[all] (>=0.9,<0.10) ; extra == "all" or extra == "cli"
|
|
44
|
-
Requires-Dist: uvicorn[standard] (>=0.24,<0
|
|
45
|
+
Requires-Dist: uvicorn[standard] (>=0.24,<1.0) ; extra == "all" or extra == "api"
|
|
45
46
|
Requires-Dist: xmltodict (>=0.13,<0.14)
|
|
46
47
|
Project-URL: Repository, https://github.com/wi2trier/cbrkit
|
|
47
48
|
Description-Content-Type: text/markdown
|
|
@@ -64,11 +65,6 @@ Description-Content-Type: text/markdown
|
|
|
64
65
|
|
|
65
66
|
# CBRkit
|
|
66
67
|
|
|
67
|
-
> [!caution]
|
|
68
|
-
> The project is under active development and does not yet adhere to semantic versioning.
|
|
69
|
-
> Breaking changes may occur at any time for versions `0.x.y`.
|
|
70
|
-
> Once the project reaches version `1.0`, semantic versioning will be applied.
|
|
71
|
-
|
|
72
68
|
## Installation
|
|
73
69
|
|
|
74
70
|
The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
|
|
@@ -85,7 +81,8 @@ pip install cbrkit[EXTRA_NAME,...]
|
|
|
85
81
|
|
|
86
82
|
where `EXTRA_NAME` is one of the following:
|
|
87
83
|
|
|
88
|
-
- `nlp`:
|
|
84
|
+
- `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
|
|
85
|
+
- `transformers`: NLP tools based on `pytorch` and `transformers`
|
|
89
86
|
- `cli`: Command Line Interface (CLI)
|
|
90
87
|
- `api`: REST API Server
|
|
91
88
|
- `all`: All of the above
|
|
@@ -95,12 +92,63 @@ where `EXTRA_NAME` is one of the following:
|
|
|
95
92
|
CBRkit allows the definition of similarity metrics through _composition_.
|
|
96
93
|
This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
|
|
97
94
|
CBRkit also includes predefined aggregation functions.
|
|
98
|
-
|
|
99
|
-
|
|
95
|
+
To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-demo) that shows how to use the library in a real-world scenario.
|
|
100
96
|
The following modules are part of CBRkit:
|
|
101
97
|
|
|
98
|
+
- `loaders`: Functions for loading cases and queries.
|
|
102
99
|
- `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
|
|
103
100
|
- `global_sim`: Similarity generator functions for aggregating the above ones.
|
|
104
101
|
- `retrieval`: Functions for retrieving cases based on a query.
|
|
105
102
|
- `typing`: Generic type definitions for defining custom functions.
|
|
106
103
|
|
|
104
|
+
CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
|
|
105
|
+
We will explain all modules and their basic usage in the following sections.
|
|
106
|
+
|
|
107
|
+
### Loading Cases and Queries
|
|
108
|
+
|
|
109
|
+
The first step is to load cases and queries.
|
|
110
|
+
We provide predefined functions for the most common formats like CSV, JSON, and XML.
|
|
111
|
+
Additionally, `cbrkit` also integrates with `pandas` for loading data frames.
|
|
112
|
+
The following example shows how to load cases and queries from a CSV file using `pandas`:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import pandas as pd
|
|
116
|
+
import cbrkit
|
|
117
|
+
|
|
118
|
+
df = pd.read_csv("path/to/cases.csv")
|
|
119
|
+
cases = cbrkit.loaders.dataframe(df)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
When dealing with formats like JSON, the files can be loaded directly:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
cases = cbrkit.loaders.json("path/to/cases.json")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Queries can be loaded using the same loader functions.
|
|
129
|
+
CBRkit expects the type of the queries to match the type of the cases.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
# for pandas
|
|
133
|
+
queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
|
|
134
|
+
# for json
|
|
135
|
+
queries = cbrkit.loaders.json("path/to/queries.json")
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
In case your query collection only contains a single query, you can use the `singleton` function to extract it.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
query = cbrkit.singleton(queries)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Alternatively, you can also create a query directly in Python:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
# for pandas
|
|
148
|
+
query = pd.Series({"name": "John", "age": 25})
|
|
149
|
+
# for json
|
|
150
|
+
query = {"name": "John", "age": 25}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Similarity Measures and Aggregation
|
|
154
|
+
|
cbrkit-0.3.2/README.md
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
<!-- markdownlint-disable MD033 MD041 -->
|
|
2
|
+
<h2><p align="center">CBRkit</p></h2>
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img width="256px" alt="cbrkit logo" src="https://raw.githubusercontent.com/wi2trier/cbrkit/main/assets/logo.png" />
|
|
5
|
+
</p>
|
|
6
|
+
<p align="center">
|
|
7
|
+
<a href="https://pypi.org/project/cbrkit/">PyPI</a> |
|
|
8
|
+
<a href="https://wi2trier.github.io/cbrkit/">Docs</a> |
|
|
9
|
+
<a href="https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py">Example</a>
|
|
10
|
+
</p>
|
|
11
|
+
<p align="center">
|
|
12
|
+
Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
# CBRkit
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
|
|
22
|
+
|
|
23
|
+
```shell
|
|
24
|
+
pip install cbrkit
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
It comes with several optional dependencies for certain tasks like NLP which can be installed with:
|
|
28
|
+
|
|
29
|
+
```shell
|
|
30
|
+
pip install cbrkit[EXTRA_NAME,...]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
where `EXTRA_NAME` is one of the following:
|
|
34
|
+
|
|
35
|
+
- `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
|
|
36
|
+
- `transformers`: NLP tools based on `pytorch` and `transformers`
|
|
37
|
+
- `cli`: Command Line Interface (CLI)
|
|
38
|
+
- `api`: REST API Server
|
|
39
|
+
- `all`: All of the above
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
CBRkit allows the definition of similarity metrics through _composition_.
|
|
44
|
+
This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
|
|
45
|
+
CBRkit also includes predefined aggregation functions.
|
|
46
|
+
To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-demo) that shows how to use the library in a real-world scenario.
|
|
47
|
+
The following modules are part of CBRkit:
|
|
48
|
+
|
|
49
|
+
- `loaders`: Functions for loading cases and queries.
|
|
50
|
+
- `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
|
|
51
|
+
- `global_sim`: Similarity generator functions for aggregating the above ones.
|
|
52
|
+
- `retrieval`: Functions for retrieving cases based on a query.
|
|
53
|
+
- `typing`: Generic type definitions for defining custom functions.
|
|
54
|
+
|
|
55
|
+
CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
|
|
56
|
+
We will explain all modules and their basic usage in the following sections.
|
|
57
|
+
|
|
58
|
+
### Loading Cases and Queries
|
|
59
|
+
|
|
60
|
+
The first step is to load cases and queries.
|
|
61
|
+
We provide predefined functions for the most common formats like CSV, JSON, and XML.
|
|
62
|
+
Additionally, `cbrkit` also integrates with `pandas` for loading data frames.
|
|
63
|
+
The following example shows how to load cases and queries from a CSV file using `pandas`:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import pandas as pd
|
|
67
|
+
import cbrkit
|
|
68
|
+
|
|
69
|
+
df = pd.read_csv("path/to/cases.csv")
|
|
70
|
+
cases = cbrkit.loaders.dataframe(df)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
When dealing with formats like JSON, the files can be loaded directly:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
cases = cbrkit.loaders.json("path/to/cases.json")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Queries can be loaded using the same loader functions.
|
|
80
|
+
CBRkit expects the type of the queries to match the type of the cases.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
# for pandas
|
|
84
|
+
queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
|
|
85
|
+
# for json
|
|
86
|
+
queries = cbrkit.loaders.json("path/to/queries.json")
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
In case your query collection only contains a single query, you can use the `singleton` function to extract it.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
query = cbrkit.singleton(queries)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Alternatively, you can also create a query directly in Python:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
# for pandas
|
|
99
|
+
query = pd.Series({"name": "John", "age": 25})
|
|
100
|
+
# for json
|
|
101
|
+
query = {"name": "John", "age": 25}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Similarity Measures and Aggregation
|
|
@@ -5,12 +5,14 @@
|
|
|
5
5
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
from . import global_sim, loaders, retrieval, sim, typing
|
|
9
8
|
|
|
10
|
-
|
|
9
|
+
from . import global_sim, helpers, loaders, retrieval, sim, typing
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
11
12
|
"loaders",
|
|
12
13
|
"sim",
|
|
13
14
|
"global_sim",
|
|
14
15
|
"typing",
|
|
15
16
|
"retrieval",
|
|
16
|
-
|
|
17
|
+
"helpers",
|
|
18
|
+
]
|
|
@@ -2,7 +2,7 @@ import statistics
|
|
|
2
2
|
from collections.abc import Mapping, Sequence
|
|
3
3
|
from typing import Literal
|
|
4
4
|
|
|
5
|
-
from cbrkit.
|
|
5
|
+
from cbrkit.helpers import unpack_sim
|
|
6
6
|
from cbrkit.typing import (
|
|
7
7
|
AggregatorFunc,
|
|
8
8
|
AnyFloat,
|
|
@@ -51,6 +51,20 @@ def aggregator(
|
|
|
51
51
|
pooling_weights: SimSeqOrMap[KeyType, float] | None = None,
|
|
52
52
|
default_pooling_weight: float = 1.0,
|
|
53
53
|
) -> AggregatorFunc[KeyType, AnyFloat]:
|
|
54
|
+
"""
|
|
55
|
+
Aggregates local similarities to a global similarity using the specified pooling function.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
pooling: The pooling function to use. It can be either a string representing the name of the pooling function or a custom pooling function (see `cbrkit.typing.PoolingFunc`).
|
|
59
|
+
pooling_weights: The weights to apply to the similarities during pooling. It can be a sequence or a mapping. If None, every local similarity is weighted equally.
|
|
60
|
+
default_pooling_weight: The default weight to use if a similarity key is not found in the pooling_weights mapping.
|
|
61
|
+
|
|
62
|
+
Examples:
|
|
63
|
+
>>> global_sim = aggregator("mean")
|
|
64
|
+
>>> global_sim([0.5, 0.75, 1.0])
|
|
65
|
+
0.75
|
|
66
|
+
"""
|
|
67
|
+
|
|
54
68
|
pooling_func = _pooling_funcs[pooling] if isinstance(pooling, str) else pooling
|
|
55
69
|
|
|
56
70
|
def wrapped_func(similarities: SimSeqOrMap[KeyType, AnyFloat]) -> float:
|
|
@@ -5,7 +5,7 @@ from typing import Any, Generic
|
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
|
-
from cbrkit.
|
|
8
|
+
from cbrkit.helpers import sim2map
|
|
9
9
|
from cbrkit.typing import (
|
|
10
10
|
AggregatorFunc,
|
|
11
11
|
AnySimFunc,
|
|
@@ -60,6 +60,19 @@ def attribute_value(
|
|
|
60
60
|
value_getter: Callable[[Any, str], Any] = _value_getter,
|
|
61
61
|
key_getter: Callable[[Any], Iterator[str]] = _key_getter,
|
|
62
62
|
) -> SimMapFunc[Any, AttributeValueData, AttributeValueSim[SimType]]:
|
|
63
|
+
"""
|
|
64
|
+
Similarity function that computes the attribute value similarity between two cases.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
attributes: A mapping of attribute names to the similarity functions to be used for those attributes. Takes precedence over types.
|
|
68
|
+
types: A mapping of attribute types to the similarity functions to be used for those types.
|
|
69
|
+
types_fallback: A similarity function to be used as a fallback when no specific similarity function
|
|
70
|
+
is defined for an attribute type.
|
|
71
|
+
aggregator: A function that aggregates the local similarity scores for each attribute into a single global similarity.
|
|
72
|
+
value_getter: A function that retrieves the value of an attribute from a case.
|
|
73
|
+
key_getter: A function that retrieves the attribute names from a target case.
|
|
74
|
+
"""
|
|
75
|
+
|
|
63
76
|
attributes_map: Mapping[str, AnySimFunc[KeyType, Any, SimType]] = (
|
|
64
77
|
{} if attributes is None else attributes
|
|
65
78
|
)
|
|
@@ -16,7 +16,7 @@ from cbrkit.global_sim.graph._model import (
|
|
|
16
16
|
NodeData,
|
|
17
17
|
NodeKey,
|
|
18
18
|
)
|
|
19
|
-
from cbrkit.
|
|
19
|
+
from cbrkit.helpers import unpack_sims
|
|
20
20
|
from cbrkit.typing import Casebase, FloatProtocol, KeyType, SimPairFunc, SimType
|
|
21
21
|
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
@@ -149,6 +149,18 @@ def astar(
|
|
|
149
149
|
edge_sim_func: SimPairFunc[EdgeData, SimType],
|
|
150
150
|
queue_limit: int,
|
|
151
151
|
) -> dict[KeyType, GraphSim[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]]:
|
|
152
|
+
"""
|
|
153
|
+
Performs the A* algorithm proposed by [Bergmann and Gil (2014)](https://doi.org/10.1016/j.is.2012.07.005) to compute the similarity between a query graph and the graphs in the casebase.
|
|
154
|
+
|
|
155
|
+
Args:
|
|
156
|
+
x_map: A casebase of graphs
|
|
157
|
+
y: Query graph
|
|
158
|
+
node_sim_func: A similarity function for graph nodes
|
|
159
|
+
edge_sim_func: A similarity function for graph edges
|
|
160
|
+
queue_limit: Limits the queue size which prunes the search space. This leads to a faster search and less memory usage but also introduces a similarity error.
|
|
161
|
+
|
|
162
|
+
"""
|
|
163
|
+
|
|
152
164
|
results = {
|
|
153
165
|
key: _astar_single(
|
|
154
166
|
x,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
|
-
from collections.abc import Iterable, Mapping, Sequence
|
|
2
|
+
from collections.abc import Collection, Iterable, Mapping, Sequence
|
|
3
3
|
from inspect import signature as inspect_signature
|
|
4
4
|
from typing import Any, cast
|
|
5
5
|
|
|
@@ -22,16 +22,71 @@ __all__ = [
|
|
|
22
22
|
"unpack_sim",
|
|
23
23
|
"unpack_sims",
|
|
24
24
|
"AbstractFloat",
|
|
25
|
+
"singleton",
|
|
25
26
|
]
|
|
26
27
|
|
|
27
28
|
|
|
29
|
+
def singleton(x: Mapping[Any, ValueType] | Collection[ValueType]) -> ValueType:
|
|
30
|
+
"""
|
|
31
|
+
Return the only element of the input, or raise an error if it does not contain exactly one element.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
x: The input collection or mapping.
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
The only element of the input.
|
|
38
|
+
|
|
39
|
+
Examples:
|
|
40
|
+
>>> singleton([1])
|
|
41
|
+
1
|
|
42
|
+
>>> singleton({1: "a"})
|
|
43
|
+
'a'
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
ValueError: If the input does not contain exactly one element (i.e., it is empty or has more than one).
|
|
47
|
+
TypeError: If the input is not a collection or mapping.
|
|
48
|
+
"""
|
|
49
|
+
if len(x) != 1:
|
|
50
|
+
raise ValueError(f"Expected exactly one element, but got {len(x)}")
|
|
51
|
+
|
|
52
|
+
if isinstance(x, Mapping):
|
|
53
|
+
return next(iter(x.values()))
|
|
54
|
+
elif isinstance(x, Collection):
|
|
55
|
+
return next(iter(x))
|
|
56
|
+
else:
|
|
57
|
+
raise TypeError(f"Expected a Mapping or Collection, but got {type(x)}")
|
|
58
|
+
|
|
59
|
+
|
|
28
60
|
def dist2sim(distance: float) -> float:
|
|
61
|
+
"""Convert a distance to a similarity.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
distance: The distance to convert
|
|
65
|
+
|
|
66
|
+
Examples:
|
|
67
|
+
>>> dist2sim(1.)
|
|
68
|
+
0.5
|
|
69
|
+
"""
|
|
29
70
|
return 1 / (1 + distance)
|
|
30
71
|
|
|
31
72
|
|
|
32
73
|
def sim2seq(
|
|
33
74
|
func: SimPairFunc[ValueType, SimType] | SimSeqFunc[ValueType, SimType],
|
|
34
75
|
) -> SimSeqFunc[ValueType, SimType]:
|
|
76
|
+
"""
|
|
77
|
+
Converts a similarity function that operates on pairs of values into a similarity function that operates on sequences of values.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
func: The similarity function to be converted.
|
|
81
|
+
|
|
82
|
+
Examples:
|
|
83
|
+
>>> def sim_func(x: int, y: int) -> float:
|
|
84
|
+
... return abs(x - y) / max(x, y)
|
|
85
|
+
...
|
|
86
|
+
>>> seq_func = sim2seq(sim_func)
|
|
87
|
+
>>> seq_func([(1, 2), (3, 4), (5, 6)])
|
|
88
|
+
[0.5, 0.25, 0.16666666666666666]
|
|
89
|
+
"""
|
|
35
90
|
signature = inspect_signature(func)
|
|
36
91
|
|
|
37
92
|
if len(signature.parameters) == 2:
|
|
@@ -76,10 +76,35 @@ class DataFrameCasebase(abc.Mapping):
|
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
def dataframe(df: DataFrame) -> Casebase[Any, pd.Series]:
|
|
79
|
+
"""Converts a pandas DataFrame into a Casebase.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
df: pandas DataFrame.
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
Returns a Casebase as a DataFrameCasebase.
|
|
86
|
+
|
|
87
|
+
Examples:
|
|
88
|
+
>>> file_path = "./data/cars-1k.csv"
|
|
89
|
+
>>> df = pd.read_csv(file_path)
|
|
90
|
+
>>> result = dataframe(df)
|
|
91
|
+
"""
|
|
79
92
|
return DataFrameCasebase(df)
|
|
80
93
|
|
|
81
94
|
|
|
82
95
|
def csv(path: FilePath) -> dict[int, dict[str, str]]:
|
|
96
|
+
"""Reads a csv file and converts it into a dict representation
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
path: File path of the csv file
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
Dict representation of the csv file.
|
|
103
|
+
|
|
104
|
+
Examples:
|
|
105
|
+
>>> file_path = "./data/cars-1k.csv"
|
|
106
|
+
>>> result = csv(file_path)
|
|
107
|
+
"""
|
|
83
108
|
data: dict[int, dict[str, str]] = {}
|
|
84
109
|
|
|
85
110
|
with open(path) as fp:
|
|
@@ -99,6 +124,18 @@ def _csv_pandas(path: FilePath) -> dict[int, pd.Series]:
|
|
|
99
124
|
|
|
100
125
|
|
|
101
126
|
def json(path: FilePath) -> dict[Any, Any]:
|
|
127
|
+
"""Reads a json file and converts it into a dict representation
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
path: File path of the json file
|
|
131
|
+
|
|
132
|
+
Returns:
|
|
133
|
+
Dict representation of the json file.
|
|
134
|
+
|
|
135
|
+
Examples:
|
|
136
|
+
>>> file_path = "data/cars-1k.json" # doctest: +SKIP
|
|
137
|
+
>>> json(file_path) # doctest: +SKIP
|
|
138
|
+
"""
|
|
102
139
|
with open(path, "rb") as fp:
|
|
103
140
|
data = orjson.loads(fp.read())
|
|
104
141
|
|
|
@@ -111,11 +148,35 @@ def json(path: FilePath) -> dict[Any, Any]:
|
|
|
111
148
|
|
|
112
149
|
|
|
113
150
|
def toml(path: FilePath) -> dict[str, Any]:
|
|
151
|
+
"""Reads a toml file and parses it into a dict representation
|
|
152
|
+
|
|
153
|
+
Args:
|
|
154
|
+
path: File path of the toml file
|
|
155
|
+
|
|
156
|
+
Returns:
|
|
157
|
+
Dict representation of the toml file.
|
|
158
|
+
|
|
159
|
+
Examples:
|
|
160
|
+
>>> file_path = "./data/file.toml" # doctest: +SKIP
|
|
161
|
+
>>> toml(file_path) # doctest: +SKIP
|
|
162
|
+
"""
|
|
114
163
|
with open(path, "rb") as fp:
|
|
115
164
|
return tomllib.load(fp)
|
|
116
165
|
|
|
117
166
|
|
|
118
167
|
def yaml(path: FilePath) -> dict[Any, Any]:
|
|
168
|
+
"""Reads a yaml file and parses it into a dict representation
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
path: File path of the yaml file
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
Dict representation of the yaml file.
|
|
175
|
+
|
|
176
|
+
Examples:
|
|
177
|
+
>>> file_path = "./data/cars-1k.yaml"
|
|
178
|
+
>>> result = yaml(file_path)
|
|
179
|
+
"""
|
|
119
180
|
data: dict[Any, Any] = {}
|
|
120
181
|
|
|
121
182
|
with open(path, "rb") as fp:
|
|
@@ -132,11 +193,35 @@ def yaml(path: FilePath) -> dict[Any, Any]:
|
|
|
132
193
|
|
|
133
194
|
|
|
134
195
|
def txt(path: FilePath) -> str:
|
|
196
|
+
"""Reads a text file and converts it into a string
|
|
197
|
+
|
|
198
|
+
Args:
|
|
199
|
+
path: File path of the text file
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
String representation of the text file.
|
|
203
|
+
|
|
204
|
+
Examples:
|
|
205
|
+
>>> file_path = "data/file.txt" # doctest: +SKIP
|
|
206
|
+
>>> txt(file_path) # doctest: +SKIP
|
|
207
|
+
"""
|
|
135
208
|
with open(path) as fp:
|
|
136
209
|
return fp.read()
|
|
137
210
|
|
|
138
211
|
|
|
139
212
|
def xml(path: FilePath) -> dict[str, Any]:
|
|
213
|
+
"""Reads an XML file and parses it into a dict representation
|
|
214
|
+
|
|
215
|
+
Args:
|
|
216
|
+
path: File path of the xml file
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
Dict representation of the xml file.
|
|
220
|
+
|
|
221
|
+
Examples:
|
|
222
|
+
>>> file_path = "data/file.xml" # doctest: +SKIP
|
|
223
|
+
>>> result = xml(file_path) # doctest: +SKIP
|
|
224
|
+
"""
|
|
140
225
|
with open(path, "rb") as fp:
|
|
141
226
|
data = xmltodict.parse(fp.read())
|
|
142
227
|
|
|
@@ -174,6 +259,18 @@ _single_loaders: dict[str, SingleLoader] = {
|
|
|
174
259
|
|
|
175
260
|
|
|
176
261
|
def data(path: FilePath) -> dict[str, Any]:
|
|
262
|
+
"""Reads a file of type json, toml, yaml, or yml and parses it into a dict representation
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
path: Path of the file
|
|
266
|
+
|
|
267
|
+
Returns:
|
|
268
|
+
Dict representation of the file.
|
|
269
|
+
|
|
270
|
+
Examples:
|
|
271
|
+
>>> yaml_file = "./data/cars-1k.yaml"
|
|
272
|
+
>>> result = data(yaml_file)
|
|
273
|
+
"""
|
|
177
274
|
if isinstance(path, str):
|
|
178
275
|
path = Path(path)
|
|
179
276
|
|
|
@@ -185,6 +282,18 @@ def data(path: FilePath) -> dict[str, Any]:
|
|
|
185
282
|
|
|
186
283
|
|
|
187
284
|
def path(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
|
|
285
|
+
"""Converts a path into a Casebase. The path can be a folder or a file.
|
|
286
|
+
|
|
287
|
+
Args:
|
|
288
|
+
path: Path of the file or folder.
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
Returns a Casebase.
|
|
292
|
+
|
|
293
|
+
Examples:
|
|
294
|
+
>>> file_path = "./data/cars-1k.csv"
|
|
295
|
+
>>> result = path(file_path)
|
|
296
|
+
"""
|
|
188
297
|
if isinstance(path, str):
|
|
189
298
|
path = Path(path)
|
|
190
299
|
|
|
@@ -204,6 +313,19 @@ def path(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
|
|
|
204
313
|
|
|
205
314
|
|
|
206
315
|
def file(path: Path) -> Casebase[Any, Any] | None:
|
|
316
|
+
"""Converts a file into a Casebase. The file can be of type csv, json, toml, yaml, or yml.
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
path: Path of the file.
|
|
320
|
+
|
|
321
|
+
Returns:
|
|
322
|
+
Returns a Casebase, or None if the file suffix is not supported by any batch loader.
|
|
323
|
+
|
|
324
|
+
Examples:
|
|
325
|
+
>>> from pathlib import Path
|
|
326
|
+
>>> file_path = Path("./data/cars-1k.csv")
|
|
327
|
+
>>> result = file(file_path)
|
|
328
|
+
"""
|
|
207
329
|
if path.suffix not in _batch_loaders:
|
|
208
330
|
return None
|
|
209
331
|
|
|
@@ -214,6 +336,20 @@ def file(path: Path) -> Casebase[Any, Any] | None:
|
|
|
214
336
|
|
|
215
337
|
|
|
216
338
|
def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None:
|
|
339
|
+
"""Converts the files of a folder into a Casebase. The files can be of type txt, csv, json, toml, yaml, or yml.
|
|
340
|
+
|
|
341
|
+
Args:
|
|
342
|
+
path: Path of the folder.
|
|
343
|
+
pattern: Relative pattern for the files.
|
|
344
|
+
|
|
345
|
+
Returns:
|
|
346
|
+
Returns a Casebase.
|
|
347
|
+
|
|
348
|
+
Examples:
|
|
349
|
+
>>> from pathlib import Path
|
|
350
|
+
>>> folder_path = Path("./data")
|
|
351
|
+
>>> result = folder(folder_path, ".csv")
|
|
352
|
+
"""
|
|
217
353
|
cb: Casebase[Any, Any] = {}
|
|
218
354
|
|
|
219
355
|
for file in path.glob(pattern):
|
|
@@ -2,8 +2,8 @@ from collections.abc import Callable, Collection, Mapping, Sequence
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from typing import Any, Generic
|
|
4
4
|
|
|
5
|
+
from cbrkit.helpers import sim2map, unpack_sim
|
|
5
6
|
from cbrkit.loaders import python as load_python
|
|
6
|
-
from cbrkit.sim._helpers import sim2map, unpack_sim
|
|
7
7
|
from cbrkit.typing import (
|
|
8
8
|
AnySimFunc,
|
|
9
9
|
Casebase,
|
|
@@ -78,6 +78,40 @@ def apply(
|
|
|
78
78
|
retrievers: RetrieveFunc[KeyType, ValueType, SimType]
|
|
79
79
|
| Sequence[RetrieveFunc[KeyType, ValueType, SimType]],
|
|
80
80
|
) -> Result[KeyType, ValueType, SimType]:
|
|
81
|
+
"""Applies a query to a Casebase using retriever functions.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
casebase: The casebase for the query.
|
|
85
|
+
query: The query that will be applied to the casebase
|
|
86
|
+
retrievers: Retriever functions that will retrieve similar cases (compared to the query) from the casebase
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
Returns an object of type Result.
|
|
90
|
+
|
|
91
|
+
Examples:
|
|
92
|
+
>>> import cbrkit
|
|
93
|
+
>>> import pandas as pd
|
|
94
|
+
>>> df = pd.read_csv("./data/cars-1k.csv")
|
|
95
|
+
>>> casebase = cbrkit.loaders.dataframe(df)
|
|
96
|
+
>>> query = casebase[42]
|
|
97
|
+
>>> retriever = cbrkit.retrieval.build(
|
|
98
|
+
... cbrkit.global_sim.attribute_value(
|
|
99
|
+
... attributes={
|
|
100
|
+
... "price": cbrkit.sim.numeric.linear(max=100000),
|
|
101
|
+
... "year": cbrkit.sim.numeric.linear(max=50),
|
|
102
|
+
... "manufacturer": cbrkit.sim.taxonomy.load(
|
|
103
|
+
... "./data/cars-taxonomy.yaml",
|
|
104
|
+
... measure=cbrkit.sim.taxonomy.wu_palmer(),
|
|
105
|
+
... ),
|
|
106
|
+
... "miles": cbrkit.sim.numeric.linear(max=1000000),
|
|
107
|
+
... },
|
|
108
|
+
... types_fallback=cbrkit.sim.generic.equality(),
|
|
109
|
+
... aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
|
|
110
|
+
... ),
|
|
111
|
+
... limit=5,
|
|
112
|
+
... )
|
|
113
|
+
>>> result = cbrkit.retrieval.apply(casebase, query, retriever)
|
|
114
|
+
"""
|
|
81
115
|
if not isinstance(retrievers, Sequence):
|
|
82
116
|
retrievers = [retrievers]
|
|
83
117
|
|
|
@@ -99,6 +133,37 @@ def build(
|
|
|
99
133
|
similarity_func: AnySimFunc[KeyType, ValueType, SimType],
|
|
100
134
|
limit: int | None = None,
|
|
101
135
|
) -> RetrieveFunc[KeyType, ValueType, SimType]:
|
|
136
|
+
"""Creates a retriever function based on the given similarity function.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
similarity_func: Similarity function to compute the similarity between cases.
|
|
140
|
+
limit: Retriever function will return the top limit cases.
|
|
141
|
+
|
|
142
|
+
Returns:
|
|
143
|
+
Returns the retriever function.
|
|
144
|
+
|
|
145
|
+
Examples:
|
|
146
|
+
>>> import cbrkit
|
|
147
|
+
>>> retriever = cbrkit.retrieval.build(
|
|
148
|
+
... cbrkit.global_sim.attribute_value(
|
|
149
|
+
... attributes={
|
|
150
|
+
... "price": cbrkit.sim.numeric.linear(max=100000),
|
|
151
|
+
... "year": cbrkit.sim.numeric.linear(max=50),
|
|
152
|
+
... "model": cbrkit.global_sim.attribute_value(
|
|
153
|
+
... attributes={
|
|
154
|
+
... "make": cbrkit.sim.generic.equality(),
|
|
155
|
+
... "manufacturer": cbrkit.sim.taxonomy.load(
|
|
156
|
+
... "./data/cars-taxonomy.yaml",
|
|
157
|
+
... measure=cbrkit.sim.taxonomy.wu_palmer(),
|
|
158
|
+
... ),
|
|
159
|
+
... }
|
|
160
|
+
... ),
|
|
161
|
+
... },
|
|
162
|
+
... aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
|
|
163
|
+
... ),
|
|
164
|
+
... limit=5,
|
|
165
|
+
... )
|
|
166
|
+
"""
|
|
102
167
|
sim_func = sim2map(similarity_func)
|
|
103
168
|
|
|
104
169
|
def wrapped_func(
|
|
@@ -1,11 +1,20 @@
|
|
|
1
1
|
from collections.abc import Collection, Set
|
|
2
2
|
from typing import Any
|
|
3
3
|
|
|
4
|
-
from cbrkit.
|
|
4
|
+
from cbrkit.helpers import dist2sim
|
|
5
5
|
from cbrkit.typing import SimPairFunc
|
|
6
6
|
|
|
7
|
+
__all__ = ["jaccard"]
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
def jaccard() -> SimPairFunc[Collection[Any], float]:
|
|
11
|
+
"""Jaccard similarity function.
|
|
12
|
+
|
|
13
|
+
Examples:
|
|
14
|
+
>>> sim = jaccard()
|
|
15
|
+
>>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
|
|
16
|
+
0.8
|
|
17
|
+
"""
|
|
9
18
|
from nltk.metrics import jaccard_distance
|
|
10
19
|
|
|
11
20
|
def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
|
|
@@ -7,12 +7,29 @@ from cbrkit.typing import (
|
|
|
7
7
|
ValueType,
|
|
8
8
|
)
|
|
9
9
|
|
|
10
|
+
__all__ = ["table", "equality"]
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
def table(
|
|
12
14
|
entries: Sequence[tuple[ValueType, ValueType, float]],
|
|
13
15
|
symmetric: bool = True,
|
|
14
16
|
default: float = 0.0,
|
|
15
17
|
) -> SimPairFunc[ValueType, float]:
|
|
18
|
+
"""Allows importing similarity values from a table.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
entries: Sequence[tuple[a, b, sim(a, b)]]
|
|
22
|
+
symmetric: If True, the table is assumed to be symmetric, i.e. sim(a, b) = sim(b, a)
|
|
23
|
+
default: Default similarity value for pairs not in the table
|
|
24
|
+
|
|
25
|
+
Examples:
|
|
26
|
+
>>> sim = table([("a", "b", 0.5), ("b", "c", 0.7)], symmetric=True, default=0.0)
|
|
27
|
+
>>> sim("b", "a")
|
|
28
|
+
0.5
|
|
29
|
+
>>> sim("a", "c")
|
|
30
|
+
0.0
|
|
31
|
+
"""
|
|
32
|
+
|
|
16
33
|
table: defaultdict[ValueType, defaultdict[ValueType, float]] = defaultdict(
|
|
17
34
|
lambda: defaultdict(lambda: default)
|
|
18
35
|
)
|
|
@@ -30,6 +47,16 @@ def table(
|
|
|
30
47
|
|
|
31
48
|
|
|
32
49
|
def equality() -> SimPairFunc[Any, float]:
|
|
50
|
+
"""Equality similarity function. Returns 1.0 if the two values are equal, 0.0 otherwise.
|
|
51
|
+
|
|
52
|
+
Examples:
|
|
53
|
+
>>> sim = equality()
|
|
54
|
+
>>> sim("b", "a")
|
|
55
|
+
0.0
|
|
56
|
+
>>> sim("a", "a")
|
|
57
|
+
1.0
|
|
58
|
+
"""
|
|
59
|
+
|
|
33
60
|
def wrapped_func(x: Any, y: Any) -> float:
|
|
34
61
|
return 1.0 if x == y else 0.0
|
|
35
62
|
|
|
@@ -15,6 +15,9 @@ def linear(max: float, min: float = 0.0) -> SimPairFunc[Number, float]:
|
|
|
15
15
|
min: Minimum bound of the interval
|
|
16
16
|
|
|
17
17
|

|
|
18
|
+
>>> sim = linear(100)
|
|
19
|
+
>>> sim(50, 60)
|
|
20
|
+
0.9
|
|
18
21
|
"""
|
|
19
22
|
|
|
20
23
|
def wrapped_func(x: Number, y: Number) -> float:
|
|
@@ -37,6 +40,12 @@ def threshold(threshold: float) -> SimPairFunc[Number, float]:
|
|
|
37
40
|
threshold: If the absolute difference between the two values is less than or equal to this value, the similarity is 1.0, otherwise it is 0.0
|
|
38
41
|
|
|
39
42
|

|
|
43
|
+
Examples:
|
|
44
|
+
>>> sim = threshold(10)
|
|
45
|
+
>>> sim(50, 60)
|
|
46
|
+
1.0
|
|
47
|
+
>>> sim(50, 61)
|
|
48
|
+
0.0
|
|
40
49
|
"""
|
|
41
50
|
|
|
42
51
|
def wrapped_func(x: Number, y: Number) -> float:
|
|
@@ -49,9 +58,13 @@ def exponential(alpha: float = 1.0) -> SimPairFunc[Number, float]:
|
|
|
49
58
|
"""Exponential similarity function.
|
|
50
59
|
|
|
51
60
|
Args:
|
|
52
|
-
alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the
|
|
61
|
+
alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the similarity decreases.
|
|
53
62
|
|
|
54
63
|

|
|
64
|
+
Examples:
|
|
65
|
+
>>> sim = exponential(0.1)
|
|
66
|
+
>>> sim(50, 60)
|
|
67
|
+
0.36787944117144233
|
|
55
68
|
"""
|
|
56
69
|
|
|
57
70
|
def wrapped_func(x: Number, y: Number) -> float:
|
|
@@ -68,6 +81,12 @@ def sigmoid(alpha: float = 1.0, theta: float = 1.0) -> SimPairFunc[Number, float
|
|
|
68
81
|
theta: Specifies the point at which the similarity value is 0.5.
|
|
69
82
|
|
|
70
83
|

|
|
84
|
+
Examples:
|
|
85
|
+
>>> sim = sigmoid(1, 10)
|
|
86
|
+
>>> sim(50, 60)
|
|
87
|
+
0.5
|
|
88
|
+
>>> sim(50, 58)
|
|
89
|
+
0.8807970779778823
|
|
71
90
|
"""
|
|
72
91
|
|
|
73
92
|
def wrapped_func(x: Number, y: Number) -> float:
|
|
@@ -12,8 +12,24 @@ from cbrkit.typing import (
|
|
|
12
12
|
SimSeqFunc,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
+
__all__ = [
|
|
16
|
+
"spacy",
|
|
17
|
+
"sentence_transformers",
|
|
18
|
+
"openai",
|
|
19
|
+
"levenshtein",
|
|
20
|
+
"jaro",
|
|
21
|
+
"jaro_winkler",
|
|
22
|
+
"table",
|
|
23
|
+
]
|
|
24
|
+
|
|
15
25
|
|
|
16
26
|
def _cosine(u, v) -> float:
|
|
27
|
+
"""Cosine similarity between two vectors
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
u: First vector
|
|
31
|
+
v: Second vector
|
|
32
|
+
"""
|
|
17
33
|
import numpy as np
|
|
18
34
|
import scipy.spatial.distance as scipy_dist
|
|
19
35
|
|
|
@@ -28,6 +44,11 @@ def _unique_items(pairs: Sequence[tuple[str, str]]) -> list[str]:
|
|
|
28
44
|
|
|
29
45
|
|
|
30
46
|
def spacy(model_name: str = "en_core_web_lg") -> SimSeqFunc[str, float]:
|
|
47
|
+
"""[spaCy](https://spacy.io/usage/linguistic-features/#vectors-similarity) based semantic similarity using word vectors. It calculates the similarity between given text pairs.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
model_name: Name of the [spaCy model](https://spacy.io/usage/models) to use to generate word vectors. Defaults to "en_core_web_lg".
|
|
51
|
+
"""
|
|
31
52
|
from spacy import load as spacy_load
|
|
32
53
|
|
|
33
54
|
nlp = spacy_load(model_name)
|
|
@@ -46,6 +67,11 @@ def spacy(model_name: str = "en_core_web_lg") -> SimSeqFunc[str, float]:
|
|
|
46
67
|
|
|
47
68
|
|
|
48
69
|
def sentence_transformers(model_name: str) -> SimSeqFunc[str, float]:
|
|
70
|
+
"""[Sentence-Transformers](https://www.sbert.net/) based semantic similarity using word vectors. It calculates the similarity between given text pairs.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
model_name: Name of the [pretrained model](https://www.sbert.net/docs/pretrained_models.html) to use to generate word vectors. It calculates the cosine similarity between given text pairs.
|
|
74
|
+
"""
|
|
49
75
|
from sentence_transformers import SentenceTransformer
|
|
50
76
|
|
|
51
77
|
model = SentenceTransformer(model_name)
|
|
@@ -61,6 +87,11 @@ def sentence_transformers(model_name: str) -> SimSeqFunc[str, float]:
|
|
|
61
87
|
|
|
62
88
|
|
|
63
89
|
def openai(model_name: str) -> SimSeqFunc[str, float]:
|
|
90
|
+
"""Semantic similarity using word vectors generated by one of OpenAI's embedding models. It calculates the cosine similarity between given text pairs.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
model_name: Name of the [embedding model](https://platform.openai.com/docs/models/embeddings) to use to generate word vectors.
|
|
94
|
+
"""
|
|
64
95
|
import numpy as np
|
|
65
96
|
from openai import Client
|
|
66
97
|
|
|
@@ -78,6 +109,18 @@ def openai(model_name: str) -> SimSeqFunc[str, float]:
|
|
|
78
109
|
|
|
79
110
|
|
|
80
111
|
def levenshtein(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
|
|
112
|
+
"""Similarity function that calculates a normalized indel similarity between two strings based on [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance).
|
|
113
|
+
|
|
114
|
+
Args:
|
|
115
|
+
score_cutoff: If the similarity is less than this value, the function will return 0.0.
|
|
116
|
+
Examples:
|
|
117
|
+
>>> sim = levenshtein()
|
|
118
|
+
>>> sim("kitten", "sitting")
|
|
119
|
+
0.6153846153846154
|
|
120
|
+
>>> sim = levenshtein(score_cutoff=0.8)
|
|
121
|
+
>>> sim("kitten", "sitting")
|
|
122
|
+
0.0
|
|
123
|
+
"""
|
|
81
124
|
import Levenshtein
|
|
82
125
|
|
|
83
126
|
def wrapped_func(x: str, y: str) -> float:
|
|
@@ -87,6 +130,18 @@ def levenshtein(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
|
|
|
87
130
|
|
|
88
131
|
|
|
89
132
|
def jaro(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
|
|
133
|
+
"""Jaro similarity function to compute similarity between two strings.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
score_cutoff: If the similarity is less than this value, the function will return 0.0.
|
|
137
|
+
Examples:
|
|
138
|
+
>>> sim = jaro()
|
|
139
|
+
>>> sim("kitten", "sitting")
|
|
140
|
+
0.746031746031746
|
|
141
|
+
>>> sim = jaro(score_cutoff=0.8)
|
|
142
|
+
>>> sim("kitten", "sitting")
|
|
143
|
+
0.0
|
|
144
|
+
"""
|
|
90
145
|
import Levenshtein
|
|
91
146
|
|
|
92
147
|
def wrapped_func(x: str, y: str) -> float:
|
|
@@ -96,8 +151,21 @@ def jaro(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
|
|
|
96
151
|
|
|
97
152
|
|
|
98
153
|
def jaro_winkler(
|
|
99
|
-
score_cutoff: float | None = None, prefix_weight: float
|
|
154
|
+
score_cutoff: float | None = None, prefix_weight: float = 0.1
|
|
100
155
|
) -> SimPairFunc[str, float]:
|
|
156
|
+
"""Jaro-Winkler similarity function to compute similarity between two strings.
|
|
157
|
+
|
|
158
|
+
Args:
|
|
159
|
+
score_cutoff: If the similarity is less than this value, the function will return 0.0.
|
|
160
|
+
prefix_weight: Weight used for the common prefix of the two strings. Has to be between 0 and 0.25. Default is 0.1.
|
|
161
|
+
Examples:
|
|
162
|
+
>>> sim = jaro_winkler()
|
|
163
|
+
>>> sim("kitten", "sitting")
|
|
164
|
+
0.746031746031746
|
|
165
|
+
>>> sim = jaro_winkler(score_cutoff=0.8)
|
|
166
|
+
>>> sim("kitten", "sitting")
|
|
167
|
+
0.0
|
|
168
|
+
"""
|
|
101
169
|
import Levenshtein
|
|
102
170
|
|
|
103
171
|
def wrapped_func(x: str, y: str) -> float:
|
|
@@ -113,6 +181,20 @@ def table(
|
|
|
113
181
|
symmetric: bool = True,
|
|
114
182
|
default: float = 0.0,
|
|
115
183
|
) -> SimPairFunc[str, float]:
|
|
184
|
+
"""Allows importing similarity values from a table.
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
entries: Sequence[tuple[a, b, sim(a, b)]]
|
|
188
|
+
symmetric: If True, the table is assumed to be symmetric, i.e. sim(a, b) = sim(b, a)
|
|
189
|
+
default: Default similarity value for pairs not in the table
|
|
190
|
+
|
|
191
|
+
Examples:
|
|
192
|
+
>>> sim = table([("a", "b", 0.5), ("b", "c", 0.7)], symmetric=True, default=0.0)
|
|
193
|
+
>>> sim("b", "a")
|
|
194
|
+
0.5
|
|
195
|
+
>>> sim("a", "c")
|
|
196
|
+
0.0
|
|
197
|
+
"""
|
|
116
198
|
if isinstance(entries, FilePath):
|
|
117
199
|
if isinstance(entries, str):
|
|
118
200
|
entries = Path(entries)
|
|
@@ -4,6 +4,8 @@ from typing import Optional, Protocol, TypedDict, cast
|
|
|
4
4
|
from cbrkit.loaders import data as load_data
|
|
5
5
|
from cbrkit.typing import FilePath, SimPairFunc
|
|
6
6
|
|
|
7
|
+
__all__ = ["Taxonomy", "TaxonomyNode", "TaxonomyFunc", "load", "wu_palmer"]
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
class SerializedNode(TypedDict, total=False):
|
|
9
11
|
key: str
|
|
@@ -76,6 +78,15 @@ class TaxonomyFunc(Protocol):
|
|
|
76
78
|
|
|
77
79
|
|
|
78
80
|
def wu_palmer() -> TaxonomyFunc:
|
|
81
|
+
"""Wu & Palmer similarity measure of two nodes in a taxonomy.
|
|
82
|
+
>>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
|
|
83
|
+
>>> sim = wu_palmer()
|
|
84
|
+
>>> sim(taxonomy, "audi", "porsche")
|
|
85
|
+
0.5
|
|
86
|
+
>>> sim(taxonomy, "audi", "bmw")
|
|
87
|
+
0.0
|
|
88
|
+
"""
|
|
89
|
+
|
|
79
90
|
def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
|
|
80
91
|
node1 = taxonomy.nodes[x]
|
|
81
92
|
node2 = taxonomy.nodes[y]
|
|
@@ -92,6 +103,13 @@ _taxonomy_func = wu_palmer()
|
|
|
92
103
|
def load(
|
|
93
104
|
path: FilePath, measure: TaxonomyFunc = _taxonomy_func
|
|
94
105
|
) -> SimPairFunc[str, float]:
|
|
106
|
+
"""Load a taxonomy and return a function that measures the similarity.
|
|
107
|
+
>>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
|
|
108
|
+
>>> sim("audi", "porsche")
|
|
109
|
+
0.5
|
|
110
|
+
>>> sim("audi", "bmw")
|
|
111
|
+
0.0
|
|
112
|
+
"""
|
|
95
113
|
taxonomy = Taxonomy(path)
|
|
96
114
|
|
|
97
115
|
def wrapped_func(x: str, y: str) -> float:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "cbrkit"
|
|
3
|
-
version = "0.3.0"
|
|
3
|
+
version = "0.3.2"
|
|
4
4
|
description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
|
|
5
5
|
authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -40,53 +40,47 @@ cbrkit = "cbrkit.cli:app"
|
|
|
40
40
|
|
|
41
41
|
[tool.poetry.dependencies]
|
|
42
42
|
python = ">=3.11, <3.13"
|
|
43
|
+
fastapi = { version = ">=0.100, <1.0", optional = true, extras = ["all"] }
|
|
44
|
+
levenshtein = { version = ">=0.23, <1.0", optional = true }
|
|
45
|
+
nltk = { version = "^3.8", optional = true }
|
|
46
|
+
openai = { version = "^1.5", optional = true }
|
|
47
|
+
orjson = "^3.9"
|
|
43
48
|
pandas = "^2.1"
|
|
49
|
+
pyarrow = ">=13.0"
|
|
44
50
|
pyyaml = "^6.0"
|
|
45
|
-
orjson = "^3.9"
|
|
46
|
-
xmltodict = "^0.13"
|
|
47
|
-
pyarrow = "^14.0"
|
|
48
|
-
typer = { version = "^0.9", extras = ["all"], optional = true }
|
|
49
|
-
fastapi = { version = "^0.104", optional = true, extras = ["all"] }
|
|
50
|
-
uvicorn = { version = "^0.24", optional = true, extras = ["standard"] }
|
|
51
|
-
spacy = { version = "^3.7", optional = true }
|
|
52
|
-
nltk = { version = "^3.8", optional = true }
|
|
53
|
-
levenshtein = { version = "^0.23", optional = true }
|
|
54
51
|
sentence-transformers = { version = "^2.2", optional = true }
|
|
55
|
-
|
|
52
|
+
spacy = { version = "^3.7", optional = true }
|
|
56
53
|
torch = { version = "^2.1.1", optional = true }
|
|
57
|
-
transformers = { version = "^4.
|
|
54
|
+
transformers = { version = "^4.35", optional = true }
|
|
55
|
+
typer = { version = "^0.9", extras = ["all"], optional = true }
|
|
56
|
+
uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
|
|
57
|
+
xmltodict = "^0.13"
|
|
58
58
|
|
|
59
59
|
[tool.poetry.group.dev.dependencies]
|
|
60
|
-
pytest = "^
|
|
60
|
+
pytest = "^8.0.0"
|
|
61
61
|
pytest-cov = "^4.1"
|
|
62
62
|
|
|
63
63
|
[tool.poetry.group.docs.dependencies]
|
|
64
|
-
pdoc = "^14.
|
|
64
|
+
pdoc = "^14.4"
|
|
65
65
|
|
|
66
66
|
[tool.poetry.extras]
|
|
67
67
|
all = [
|
|
68
|
-
"typer",
|
|
69
68
|
"fastapi",
|
|
70
|
-
"uvicorn",
|
|
71
|
-
"spacy",
|
|
72
|
-
"nltk",
|
|
73
69
|
"levenshtein",
|
|
74
|
-
"
|
|
70
|
+
"nltk",
|
|
75
71
|
"openai",
|
|
72
|
+
"sentence-transformers",
|
|
73
|
+
"spacy",
|
|
74
|
+
"spacy",
|
|
76
75
|
"torch",
|
|
77
76
|
"transformers",
|
|
77
|
+
"typer",
|
|
78
|
+
"uvicorn",
|
|
78
79
|
]
|
|
79
80
|
cli = ["typer"]
|
|
80
81
|
api = ["fastapi", "uvicorn"]
|
|
81
|
-
nlp = [
|
|
82
|
-
|
|
83
|
-
"nltk",
|
|
84
|
-
"levenshtein",
|
|
85
|
-
"sentence-transformers",
|
|
86
|
-
"openai",
|
|
87
|
-
"torch",
|
|
88
|
-
"transformers",
|
|
89
|
-
]
|
|
82
|
+
nlp = ["levenshtein", "nltk", "openai", "spacy"]
|
|
83
|
+
transformers = ["sentence-transformers", "torch", "transformers"]
|
|
90
84
|
|
|
91
85
|
[tool.pytest.ini_options]
|
|
92
86
|
addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"
|
cbrkit-0.3.0/README.md
DELETED
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
<!-- markdownlint-disable MD033 MD041 -->
|
|
2
|
-
<h2><p align="center">CBRkit</p></h2>
|
|
3
|
-
<p align="center">
|
|
4
|
-
<img width="256px" alt="cbrkit logo" src="https://raw.githubusercontent.com/wi2trier/cbrkit/main/assets/logo.png" />
|
|
5
|
-
</p>
|
|
6
|
-
<p align="center">
|
|
7
|
-
<a href="https://pypi.org/project/cbrkit/">PyPI</a> |
|
|
8
|
-
<a href="https://wi2trier.github.io/cbrkit/">Docs</a> |
|
|
9
|
-
<a href="https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py">Example</a>
|
|
10
|
-
</p>
|
|
11
|
-
<p align="center">
|
|
12
|
-
Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
|
|
13
|
-
</p>
|
|
14
|
-
|
|
15
|
-
---
|
|
16
|
-
|
|
17
|
-
# CBRkit
|
|
18
|
-
|
|
19
|
-
> [!caution]
|
|
20
|
-
> The project is under active development and does not yet adhere to semantic versioning.
|
|
21
|
-
> Breaking changes may occur at any time for versions `0.x.y`.
|
|
22
|
-
> Once the project reaches version `1.0`, semantic versioning will be applied.
|
|
23
|
-
|
|
24
|
-
## Installation
|
|
25
|
-
|
|
26
|
-
The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
|
|
27
|
-
|
|
28
|
-
```shell
|
|
29
|
-
pip install cbrkit
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
It comes with several optional dependencies for certain tasks like NLP which can be installed with:
|
|
33
|
-
|
|
34
|
-
```shell
|
|
35
|
-
pip install cbrkit[EXTRA_NAME,...]
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
where `EXTRA_NAME` is one of the following:
|
|
39
|
-
|
|
40
|
-
- `nlp`: Natural Language Processing (NLP), including `spacy`, `openai`, and `sentence-transformers`
|
|
41
|
-
- `cli`: Command Line Interface (CLI)
|
|
42
|
-
- `api`: REST API Server
|
|
43
|
-
- `all`: All of the above
|
|
44
|
-
|
|
45
|
-
## Usage
|
|
46
|
-
|
|
47
|
-
CBRkit allows the definition of similarity metrics through _composition_.
|
|
48
|
-
This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
|
|
49
|
-
CBRkit also includes predefined aggregation functions.
|
|
50
|
-
A working retrieval example can be found as part of our [testing suite](https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py).
|
|
51
|
-
|
|
52
|
-
The following modules are part of CBRkit:
|
|
53
|
-
|
|
54
|
-
- `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
|
|
55
|
-
- `global_sim`: Similarity generator functions for aggregating the above ones.
|
|
56
|
-
- `retrieval`: Functions for retrieving cases based on a query.
|
|
57
|
-
- `typing`: Generic type definitions for defining custom functions.
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from . import collections, generic, numeric, strings, taxonomy
|
|
2
|
-
from ._helpers import AbstractFloat, dist2sim, sim2map, sim2seq, unpack_sim, unpack_sims
|
|
3
|
-
|
|
4
|
-
__all__ = [
|
|
5
|
-
"collections",
|
|
6
|
-
"generic",
|
|
7
|
-
"numeric",
|
|
8
|
-
"strings",
|
|
9
|
-
"taxonomy",
|
|
10
|
-
"dist2sim",
|
|
11
|
-
"sim2map",
|
|
12
|
-
"sim2seq",
|
|
13
|
-
"unpack_sim",
|
|
14
|
-
"unpack_sims",
|
|
15
|
-
"AbstractFloat",
|
|
16
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|