cbrkit 0.6.2__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cbrkit-0.6.2 → cbrkit-0.8.0}/PKG-INFO +5 -2
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/__init__.py +0 -1
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/cli.py +0 -1
- cbrkit-0.8.0/cbrkit/sim/collections.py +91 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/strings/__init__.py +77 -1
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/typing.py +5 -10
- {cbrkit-0.6.2 → cbrkit-0.8.0}/pyproject.toml +7 -2
- cbrkit-0.6.2/cbrkit/sim/collections.py +0 -28
- {cbrkit-0.6.2 → cbrkit-0.8.0}/LICENSE +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/README.md +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/__main__.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/api.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/helpers.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/loaders.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/py.typed +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/retrieval.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/__init__.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/_aggregator.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/_attribute_value.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/generic.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/graph/__init__.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/graph/_astar.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/graph/_model.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/numbers.py +0 -0
- {cbrkit-0.6.2 → cbrkit-0.8.0}/cbrkit/sim/strings/taxonomy.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cbrkit
|
|
3
|
-
Version: 0.6.2
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
|
|
5
5
|
Home-page: https://wi2trier.github.io/cbrkit/
|
|
6
6
|
License: MIT
|
|
@@ -28,9 +28,12 @@ Provides-Extra: all
|
|
|
28
28
|
Provides-Extra: api
|
|
29
29
|
Provides-Extra: cli
|
|
30
30
|
Provides-Extra: nlp
|
|
31
|
+
Provides-Extra: timeseries
|
|
31
32
|
Provides-Extra: transformers
|
|
33
|
+
Requires-Dist: dtaidistance (>=2.3.11,<3.0.0) ; extra == "all" or extra == "timeseries"
|
|
32
34
|
Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
|
|
33
35
|
Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
|
|
36
|
+
Requires-Dist: minineedle (>=3.1.5,<4.0.0) ; extra == "all" or extra == "timeseries"
|
|
34
37
|
Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
|
|
35
38
|
Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
|
|
36
39
|
Requires-Dist: orjson (>=3.9,<4.0)
|
|
@@ -38,7 +41,7 @@ Requires-Dist: pandas (>=2.1,<3.0)
|
|
|
38
41
|
Requires-Dist: pyarrow (>=13.0)
|
|
39
42
|
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
40
43
|
Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
|
|
41
|
-
Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "
|
|
44
|
+
Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
|
|
42
45
|
Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
|
|
43
46
|
Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
|
|
44
47
|
Requires-Dist: typer[all] (>=0.9,<1.0) ; extra == "all" or extra == "cli"
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from collections.abc import Collection, Sequence, Set
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from cbrkit.helpers import dist2sim
|
|
5
|
+
from cbrkit.typing import SimPairFunc
|
|
6
|
+
|
|
7
|
+
Number = float | int
|
|
8
|
+
|
|
9
|
+
__all__ = ["jaccard", "smith_waterman", "dtw"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def jaccard() -> SimPairFunc[Collection[Any], float]:
|
|
13
|
+
"""Jaccard similarity function.
|
|
14
|
+
|
|
15
|
+
Examples:
|
|
16
|
+
>>> sim = jaccard()
|
|
17
|
+
>>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
|
|
18
|
+
0.8
|
|
19
|
+
"""
|
|
20
|
+
from nltk.metrics import jaccard_distance
|
|
21
|
+
|
|
22
|
+
def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
|
|
23
|
+
if not isinstance(x, Set):
|
|
24
|
+
x = set(x)
|
|
25
|
+
if not isinstance(y, Set):
|
|
26
|
+
y = set(y)
|
|
27
|
+
|
|
28
|
+
return dist2sim(jaccard_distance(x, y))
|
|
29
|
+
|
|
30
|
+
return wrapped_func
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def smith_waterman(
|
|
34
|
+
match_score: int = 2, mismatch_penalty: int = -1, gap_penalty: int = -1
|
|
35
|
+
) -> SimPairFunc[Sequence[Any], float]:
|
|
36
|
+
"""
|
|
37
|
+
Performs the Smith-Waterman alignment with configurable scoring parameters. If no element matches it returns 0.0.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
match_score: Score for matching characters. Defaults to 2.
|
|
41
|
+
mismatch_penalty: Penalty for mismatching characters. Defaults to -1.
|
|
42
|
+
gap_penalty: Penalty for gaps. Defaults to -1.
|
|
43
|
+
|
|
44
|
+
Example:
|
|
45
|
+
>>> sim = smith_waterman()
|
|
46
|
+
>>> sim("abcde", "fghe")
|
|
47
|
+
2
|
|
48
|
+
"""
|
|
49
|
+
from minineedle import core, smith
|
|
50
|
+
|
|
51
|
+
def wrapped_func(x: Sequence[Any], y: Sequence[Any]) -> float:
|
|
52
|
+
try:
|
|
53
|
+
alignment = smith.SmithWaterman(x, y)
|
|
54
|
+
alignment.change_matrix(
|
|
55
|
+
core.ScoreMatrix(
|
|
56
|
+
match=match_score, miss=mismatch_penalty, gap=gap_penalty
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
alignment.align()
|
|
60
|
+
|
|
61
|
+
return alignment.get_score()
|
|
62
|
+
except ZeroDivisionError:
|
|
63
|
+
return 0.0
|
|
64
|
+
|
|
65
|
+
return wrapped_func
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def dtw() -> SimPairFunc[Collection[int], float]:
|
|
69
|
+
"""Dynamic Time Warping similarity function.
|
|
70
|
+
|
|
71
|
+
Examples:
|
|
72
|
+
>>> sim = dtw()
|
|
73
|
+
>>> sim([1, 2, 3], [1, 2, 3, 4])
|
|
74
|
+
0.5
|
|
75
|
+
"""
|
|
76
|
+
import numpy as np
|
|
77
|
+
from dtaidistance import dtw
|
|
78
|
+
|
|
79
|
+
def wrapped_func(
|
|
80
|
+
x: Collection[Number] | np.ndarray, y: Collection[Number] | np.ndarray
|
|
81
|
+
) -> float:
|
|
82
|
+
if not isinstance(x, np.ndarray):
|
|
83
|
+
x = np.array(x)
|
|
84
|
+
if not isinstance(y, np.ndarray):
|
|
85
|
+
y = np.array(y)
|
|
86
|
+
|
|
87
|
+
distance = dtw.distance(x, y)
|
|
88
|
+
|
|
89
|
+
return dist2sim(distance)
|
|
90
|
+
|
|
91
|
+
return wrapped_func
|
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import csv
|
|
6
|
+
import fnmatch
|
|
6
7
|
import itertools
|
|
7
|
-
|
|
8
|
+
import re
|
|
9
|
+
from collections.abc import Callable, Sequence
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
from typing import cast
|
|
10
12
|
|
|
@@ -187,6 +189,80 @@ def jaro_winkler(
|
|
|
187
189
|
return wrapped_func
|
|
188
190
|
|
|
189
191
|
|
|
192
|
+
def ngram(
|
|
193
|
+
n: int,
|
|
194
|
+
case_sensitive: bool = False,
|
|
195
|
+
tokenizer: Callable[[str], Sequence[str]] | None = None,
|
|
196
|
+
) -> SimPairFunc[str, float]:
|
|
197
|
+
"""N-gram similarity function to compute [similarity](https://procake.pages.gitlab.rlp.net/procake-wiki/sim/strings/#n-gram) between two strings.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
n: Length of the n-gram
|
|
201
|
+
case_sensitive: If True, the comparison is case-sensitive
|
|
202
|
+
tokenizer: Tokenizer function to split the input strings into tokens. If None, the input strings are split into characters.
|
|
203
|
+
Examples:
|
|
204
|
+
>>> sim = ngram(3, case_sensitive=False)
|
|
205
|
+
>>> sim("kitten", "sitting")
|
|
206
|
+
0.125
|
|
207
|
+
|
|
208
|
+
"""
|
|
209
|
+
from nltk.util import ngrams
|
|
210
|
+
|
|
211
|
+
def wrapped_func(x: str, y: str) -> float:
|
|
212
|
+
if not case_sensitive:
|
|
213
|
+
x = x.lower()
|
|
214
|
+
y = y.lower()
|
|
215
|
+
|
|
216
|
+
x_items = tokenizer(x) if tokenizer is not None else list(x)
|
|
217
|
+
y_items = tokenizer(y) if tokenizer is not None else list(y)
|
|
218
|
+
|
|
219
|
+
x_ngrams = set(ngrams(x_items, n))
|
|
220
|
+
y_ngrams = set(ngrams(y_items, n))
|
|
221
|
+
|
|
222
|
+
return len(x_ngrams.intersection(y_ngrams)) / len(x_ngrams.union(y_ngrams))
|
|
223
|
+
|
|
224
|
+
return wrapped_func
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def regex() -> SimPairFunc[str, float]:
|
|
228
|
+
"""Compares a case x to a query y, written as a regular expression. If the case matches the query, the similarity is 1.0, otherwise 0.0.
|
|
229
|
+
|
|
230
|
+
Examples:
|
|
231
|
+
>>> sim = regex()
|
|
232
|
+
>>> sim("Test1", "T.st[0-9]")
|
|
233
|
+
1.0
|
|
234
|
+
>>> sim("Test2", "T.st[3-6]")
|
|
235
|
+
0.0
|
|
236
|
+
"""
|
|
237
|
+
|
|
238
|
+
def wrapped_func(x: str, y: str) -> float:
|
|
239
|
+
regex = re.compile(y)
|
|
240
|
+
return 1.0 if regex.match(x) else 0.0
|
|
241
|
+
|
|
242
|
+
return wrapped_func
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def glob(case_sensitive: bool = False) -> SimPairFunc[str, float]:
|
|
246
|
+
"""Compares a case x to a query y, written as a glob pattern, which can contain wildcards. If the case matches the query, the similarity is 1.0, otherwise 0.0.
|
|
247
|
+
|
|
248
|
+
Args:
|
|
249
|
+
case_sensitive: If True, the comparison is case-sensitive
|
|
250
|
+
Examples:
|
|
251
|
+
>>> sim = glob()
|
|
252
|
+
>>> sim("Test1", "Test?")
|
|
253
|
+
1.0
|
|
254
|
+
>>> sim("Test2", "Test[3-9]")
|
|
255
|
+
0.0
|
|
256
|
+
"""
|
|
257
|
+
|
|
258
|
+
comparison_func = fnmatch.fnmatchcase if case_sensitive else fnmatch.fnmatch
|
|
259
|
+
|
|
260
|
+
def wrapped_func(x: str, y: str) -> float:
|
|
261
|
+
return 1.0 if comparison_func(x, y) else 0.0
|
|
262
|
+
|
|
263
|
+
return wrapped_func
|
|
264
|
+
|
|
265
|
+
|
|
190
266
|
def table(
|
|
191
267
|
entries: Sequence[tuple[str, str, float]] | FilePath,
|
|
192
268
|
symmetric: bool = True,
|
|
@@ -32,20 +32,17 @@ SimSeqOrMap = SimMap[KeyType, SimType] | SimSeq[SimType]
|
|
|
32
32
|
class SimMapFunc(Protocol[KeyType, ValueType_contra, SimType_cov]):
|
|
33
33
|
def __call__(
|
|
34
34
|
self, x_map: Mapping[KeyType, ValueType_contra], y: ValueType_contra
|
|
35
|
-
) -> SimMap[KeyType, SimType_cov]:
|
|
36
|
-
...
|
|
35
|
+
) -> SimMap[KeyType, SimType_cov]: ...
|
|
37
36
|
|
|
38
37
|
|
|
39
38
|
class SimSeqFunc(Protocol[ValueType_contra, SimType_cov]):
|
|
40
39
|
def __call__(
|
|
41
40
|
self, pairs: Sequence[tuple[ValueType_contra, ValueType_contra]], /
|
|
42
|
-
) -> SimSeq[SimType_cov]:
|
|
43
|
-
...
|
|
41
|
+
) -> SimSeq[SimType_cov]: ...
|
|
44
42
|
|
|
45
43
|
|
|
46
44
|
class SimPairFunc(Protocol[ValueType_contra, SimType_cov]):
|
|
47
|
-
def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov:
|
|
48
|
-
...
|
|
45
|
+
def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov: ...
|
|
49
46
|
|
|
50
47
|
|
|
51
48
|
AnySimFunc = (
|
|
@@ -60,8 +57,7 @@ class AggregatorFunc(Protocol[KeyType, SimType_contra]):
|
|
|
60
57
|
self,
|
|
61
58
|
similarities: SimSeqOrMap[KeyType, SimType_contra],
|
|
62
59
|
/,
|
|
63
|
-
) -> float:
|
|
64
|
-
...
|
|
60
|
+
) -> float: ...
|
|
65
61
|
|
|
66
62
|
|
|
67
63
|
class PoolingFunc(Protocol):
|
|
@@ -69,5 +65,4 @@ class PoolingFunc(Protocol):
|
|
|
69
65
|
self,
|
|
70
66
|
similarities: SimSeq[float],
|
|
71
67
|
/,
|
|
72
|
-
) -> float:
|
|
73
|
-
...
|
|
68
|
+
) -> float: ...
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "cbrkit"
|
|
3
|
-
version = "0.6.2"
|
|
3
|
+
version = "0.8.0"
|
|
4
4
|
description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
|
|
5
5
|
authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -54,11 +54,14 @@ torch = { version = "^2.1.1", optional = true }
|
|
|
54
54
|
transformers = { version = "^4.35", optional = true }
|
|
55
55
|
typer = { version = ">=0.9, <1.0", extras = ["all"], optional = true }
|
|
56
56
|
uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
|
|
57
|
+
dtaidistance = { version = "^2.3.11", optional = true }
|
|
58
|
+
minineedle ={ version = "^3.1.5", optional = true }
|
|
57
59
|
xmltodict = ">=0.13, <1.0"
|
|
58
60
|
|
|
59
61
|
[tool.poetry.group.dev.dependencies]
|
|
60
62
|
pytest = "^8.0.0"
|
|
61
63
|
pytest-cov = "^5.0.0"
|
|
64
|
+
ruff = "^0.3.4"
|
|
62
65
|
|
|
63
66
|
[tool.poetry.group.docs.dependencies]
|
|
64
67
|
pdoc = "^14.4"
|
|
@@ -71,16 +74,18 @@ all = [
|
|
|
71
74
|
"openai",
|
|
72
75
|
"sentence-transformers",
|
|
73
76
|
"spacy",
|
|
74
|
-
"spacy",
|
|
75
77
|
"torch",
|
|
76
78
|
"transformers",
|
|
77
79
|
"typer",
|
|
78
80
|
"uvicorn",
|
|
81
|
+
"minineedle",
|
|
82
|
+
"dtaidistance",
|
|
79
83
|
]
|
|
80
84
|
cli = ["typer"]
|
|
81
85
|
api = ["fastapi", "uvicorn"]
|
|
82
86
|
nlp = ["levenshtein", "nltk", "openai", "spacy"]
|
|
83
87
|
transformers = ["sentence-transformers", "torch", "transformers"]
|
|
88
|
+
timeseries = ["minineedle", "dtaidistance"]
|
|
84
89
|
|
|
85
90
|
[tool.pytest.ini_options]
|
|
86
91
|
addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from collections.abc import Collection, Set
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
from cbrkit.helpers import dist2sim
|
|
5
|
-
from cbrkit.typing import SimPairFunc
|
|
6
|
-
|
|
7
|
-
__all__ = ["jaccard"]
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def jaccard() -> SimPairFunc[Collection[Any], float]:
|
|
11
|
-
"""Jaccard similarity function.
|
|
12
|
-
|
|
13
|
-
Examples:
|
|
14
|
-
>>> sim = jaccard()
|
|
15
|
-
>>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
|
|
16
|
-
0.8
|
|
17
|
-
"""
|
|
18
|
-
from nltk.metrics import jaccard_distance
|
|
19
|
-
|
|
20
|
-
def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
|
|
21
|
-
if not isinstance(x, Set):
|
|
22
|
-
x = set(x)
|
|
23
|
-
if not isinstance(y, Set):
|
|
24
|
-
y = set(y)
|
|
25
|
-
|
|
26
|
-
return dist2sim(jaccard_distance(x, y))
|
|
27
|
-
|
|
28
|
-
return wrapped_func
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|