cbrkit 0.7.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cbrkit-0.7.0 → cbrkit-0.9.0}/PKG-INFO +7 -2
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/__init__.py +0 -1
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/cli.py +0 -1
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/loaders.py +41 -3
- cbrkit-0.9.0/cbrkit/sim/collections.py +91 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/typing.py +5 -10
- {cbrkit-0.7.0 → cbrkit-0.9.0}/pyproject.toml +12 -5
- cbrkit-0.7.0/cbrkit/sim/collections.py +0 -28
- {cbrkit-0.7.0 → cbrkit-0.9.0}/LICENSE +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/README.md +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/__main__.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/api.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/helpers.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/py.typed +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/retrieval.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/__init__.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/_aggregator.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/_attribute_value.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/generic.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/graph/__init__.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/graph/_astar.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/graph/_model.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/numbers.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/strings/__init__.py +0 -0
- {cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/sim/strings/taxonomy.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cbrkit
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
|
|
5
5
|
Home-page: https://wi2trier.github.io/cbrkit/
|
|
6
6
|
License: MIT
|
|
@@ -28,17 +28,22 @@ Provides-Extra: all
|
|
|
28
28
|
Provides-Extra: api
|
|
29
29
|
Provides-Extra: cli
|
|
30
30
|
Provides-Extra: nlp
|
|
31
|
+
Provides-Extra: timeseries
|
|
31
32
|
Provides-Extra: transformers
|
|
33
|
+
Requires-Dist: dtaidistance (>=2.3,<3.0) ; extra == "all" or extra == "timeseries"
|
|
32
34
|
Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
|
|
33
35
|
Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
|
|
36
|
+
Requires-Dist: minineedle (>=3.1,<4.0) ; extra == "all" or extra == "timeseries"
|
|
34
37
|
Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
|
|
35
38
|
Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
|
|
36
39
|
Requires-Dist: orjson (>=3.9,<4.0)
|
|
37
40
|
Requires-Dist: pandas (>=2.1,<3.0)
|
|
38
41
|
Requires-Dist: pyarrow (>=13.0)
|
|
42
|
+
Requires-Dist: pydantic (>=2.0,<3.0)
|
|
39
43
|
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
40
44
|
Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
|
|
41
|
-
Requires-Dist:
|
|
45
|
+
Requires-Dist: setuptools (>=69,<70)
|
|
46
|
+
Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
|
|
42
47
|
Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
|
|
43
48
|
Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
|
|
44
49
|
Requires-Dist: typer[all] (>=0.9,<1.0) ; extra == "all" or extra == "cli"
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a `validate` function is also provided.
|
|
3
|
+
"""
|
|
4
|
+
|
|
1
5
|
import csv as csvlib
|
|
2
6
|
import tomllib
|
|
3
7
|
from collections import abc
|
|
4
|
-
from collections.abc import Callable, Iterator
|
|
8
|
+
from collections.abc import Callable, Iterator, Mapping
|
|
5
9
|
from importlib import import_module
|
|
6
10
|
from pathlib import Path
|
|
7
11
|
from typing import Any, cast
|
|
@@ -11,6 +15,7 @@ import pandas as pd
|
|
|
11
15
|
import xmltodict
|
|
12
16
|
import yaml as yamllib
|
|
13
17
|
from pandas import DataFrame, Series
|
|
18
|
+
from pydantic import BaseModel
|
|
14
19
|
|
|
15
20
|
from cbrkit.typing import Casebase, FilePath
|
|
16
21
|
|
|
@@ -26,6 +31,7 @@ __all__ = [
|
|
|
26
31
|
"python",
|
|
27
32
|
"txt",
|
|
28
33
|
"xml",
|
|
34
|
+
"validate",
|
|
29
35
|
]
|
|
30
36
|
|
|
31
37
|
|
|
@@ -325,6 +331,7 @@ def file(path: Path) -> Casebase[Any, Any] | None:
|
|
|
325
331
|
>>> from pathlib import Path
|
|
326
332
|
>>> file_path = Path("./data/cars-1k.csv")
|
|
327
333
|
>>> result = file(file_path)
|
|
334
|
+
|
|
328
335
|
"""
|
|
329
336
|
if path.suffix not in _batch_loaders:
|
|
330
337
|
return None
|
|
@@ -348,16 +355,47 @@ def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None:
|
|
|
348
355
|
Examples:
|
|
349
356
|
>>> from pathlib import Path
|
|
350
357
|
>>> folder_path = Path("./data")
|
|
351
|
-
>>> result = folder(folder_path, "
|
|
358
|
+
>>> result = folder(folder_path, "*.csv")
|
|
359
|
+
>>> assert result is not None
|
|
352
360
|
"""
|
|
353
361
|
cb: Casebase[Any, Any] = {}
|
|
354
362
|
|
|
355
363
|
for file in path.glob(pattern):
|
|
356
364
|
if file.is_file() and file.suffix in _single_loaders:
|
|
357
|
-
loader = _single_loaders[
|
|
365
|
+
loader = _single_loaders[file.suffix]
|
|
358
366
|
cb[file.name] = loader(file)
|
|
359
367
|
|
|
360
368
|
if len(cb) == 0:
|
|
361
369
|
return None
|
|
362
370
|
|
|
363
371
|
return cb
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def validate(data: Casebase[Any, Any] | Any, validation_model: BaseModel):
|
|
375
|
+
"""Validates the data against a Pydantic model. Throws a ValueError if data is None or a Pydantic ValidationError if the data does not match the model.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
data: Data to validate. Can be an entire case base or a single case.
|
|
379
|
+
validation_model: Pydantic model to validate the data.
|
|
380
|
+
|
|
381
|
+
Examples:
|
|
382
|
+
>>> from pydantic import BaseModel, PositiveInt, NonNegativeInt
|
|
383
|
+
>>> from data.cars_validation_model import Car
|
|
384
|
+
>>> from pathlib import Path
|
|
385
|
+
>>> data = path(Path("data/cars-1k.csv"))
|
|
386
|
+
>>> validate(data, Car)
|
|
387
|
+
>>> import pandas as pd
|
|
388
|
+
>>> df = pd.read_csv("data/cars-1k.csv")
|
|
389
|
+
>>> data = dataframe(df)
|
|
390
|
+
>>> validate(data, Car)
|
|
391
|
+
"""
|
|
392
|
+
assert data is not None
|
|
393
|
+
|
|
394
|
+
if isinstance(data, DataFrameCasebase):
|
|
395
|
+
data = data.df.to_dict("index")
|
|
396
|
+
|
|
397
|
+
if isinstance(data, Mapping):
|
|
398
|
+
for item in data.values():
|
|
399
|
+
validation_model.model_validate(item)
|
|
400
|
+
else:
|
|
401
|
+
validation_model.model_validate(data)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from collections.abc import Collection, Sequence, Set
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from cbrkit.helpers import dist2sim
|
|
5
|
+
from cbrkit.typing import SimPairFunc
|
|
6
|
+
|
|
7
|
+
Number = float | int
|
|
8
|
+
|
|
9
|
+
__all__ = ["jaccard", "smith_waterman", "dtw"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def jaccard() -> SimPairFunc[Collection[Any], float]:
|
|
13
|
+
"""Jaccard similarity function.
|
|
14
|
+
|
|
15
|
+
Examples:
|
|
16
|
+
>>> sim = jaccard()
|
|
17
|
+
>>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
|
|
18
|
+
0.8
|
|
19
|
+
"""
|
|
20
|
+
from nltk.metrics import jaccard_distance
|
|
21
|
+
|
|
22
|
+
def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
|
|
23
|
+
if not isinstance(x, Set):
|
|
24
|
+
x = set(x)
|
|
25
|
+
if not isinstance(y, Set):
|
|
26
|
+
y = set(y)
|
|
27
|
+
|
|
28
|
+
return dist2sim(jaccard_distance(x, y))
|
|
29
|
+
|
|
30
|
+
return wrapped_func
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def smith_waterman(
|
|
34
|
+
match_score: int = 2, mismatch_penalty: int = -1, gap_penalty: int = -1
|
|
35
|
+
) -> SimPairFunc[Sequence[Any], float]:
|
|
36
|
+
"""
|
|
37
|
+
Performs the Smith-Waterman alignment with configurable scoring parameters. If no element matches it returns 0.0.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
match_score: Score for matching characters. Defaults to 2.
|
|
41
|
+
mismatch_penalty: Penalty for mismatching characters. Defaults to -1.
|
|
42
|
+
gap_penalty: Penalty for gaps. Defaults to -1.
|
|
43
|
+
|
|
44
|
+
Example:
|
|
45
|
+
>>> sim = smith_waterman()
|
|
46
|
+
>>> sim("abcde", "fghe")
|
|
47
|
+
2
|
|
48
|
+
"""
|
|
49
|
+
from minineedle import core, smith
|
|
50
|
+
|
|
51
|
+
def wrapped_func(x: Sequence[Any], y: Sequence[Any]) -> float:
|
|
52
|
+
try:
|
|
53
|
+
alignment = smith.SmithWaterman(x, y)
|
|
54
|
+
alignment.change_matrix(
|
|
55
|
+
core.ScoreMatrix(
|
|
56
|
+
match=match_score, miss=mismatch_penalty, gap=gap_penalty
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
alignment.align()
|
|
60
|
+
|
|
61
|
+
return alignment.get_score()
|
|
62
|
+
except ZeroDivisionError:
|
|
63
|
+
return 0.0
|
|
64
|
+
|
|
65
|
+
return wrapped_func
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def dtw() -> SimPairFunc[Collection[int], float]:
|
|
69
|
+
"""Dynamic Time Warping similarity function.
|
|
70
|
+
|
|
71
|
+
Examples:
|
|
72
|
+
>>> sim = dtw()
|
|
73
|
+
>>> sim([1, 2, 3], [1, 2, 3, 4])
|
|
74
|
+
0.5
|
|
75
|
+
"""
|
|
76
|
+
import numpy as np
|
|
77
|
+
from dtaidistance import dtw
|
|
78
|
+
|
|
79
|
+
def wrapped_func(
|
|
80
|
+
x: Collection[Number] | np.ndarray, y: Collection[Number] | np.ndarray
|
|
81
|
+
) -> float:
|
|
82
|
+
if not isinstance(x, np.ndarray):
|
|
83
|
+
x = np.array(x)
|
|
84
|
+
if not isinstance(y, np.ndarray):
|
|
85
|
+
y = np.array(y)
|
|
86
|
+
|
|
87
|
+
distance = dtw.distance(x, y)
|
|
88
|
+
|
|
89
|
+
return dist2sim(distance)
|
|
90
|
+
|
|
91
|
+
return wrapped_func
|
|
@@ -32,20 +32,17 @@ SimSeqOrMap = SimMap[KeyType, SimType] | SimSeq[SimType]
|
|
|
32
32
|
class SimMapFunc(Protocol[KeyType, ValueType_contra, SimType_cov]):
|
|
33
33
|
def __call__(
|
|
34
34
|
self, x_map: Mapping[KeyType, ValueType_contra], y: ValueType_contra
|
|
35
|
-
) -> SimMap[KeyType, SimType_cov]:
|
|
36
|
-
...
|
|
35
|
+
) -> SimMap[KeyType, SimType_cov]: ...
|
|
37
36
|
|
|
38
37
|
|
|
39
38
|
class SimSeqFunc(Protocol[ValueType_contra, SimType_cov]):
|
|
40
39
|
def __call__(
|
|
41
40
|
self, pairs: Sequence[tuple[ValueType_contra, ValueType_contra]], /
|
|
42
|
-
) -> SimSeq[SimType_cov]:
|
|
43
|
-
...
|
|
41
|
+
) -> SimSeq[SimType_cov]: ...
|
|
44
42
|
|
|
45
43
|
|
|
46
44
|
class SimPairFunc(Protocol[ValueType_contra, SimType_cov]):
|
|
47
|
-
def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov:
|
|
48
|
-
...
|
|
45
|
+
def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov: ...
|
|
49
46
|
|
|
50
47
|
|
|
51
48
|
AnySimFunc = (
|
|
@@ -60,8 +57,7 @@ class AggregatorFunc(Protocol[KeyType, SimType_contra]):
|
|
|
60
57
|
self,
|
|
61
58
|
similarities: SimSeqOrMap[KeyType, SimType_contra],
|
|
62
59
|
/,
|
|
63
|
-
) -> float:
|
|
64
|
-
...
|
|
60
|
+
) -> float: ...
|
|
65
61
|
|
|
66
62
|
|
|
67
63
|
class PoolingFunc(Protocol):
|
|
@@ -69,5 +65,4 @@ class PoolingFunc(Protocol):
|
|
|
69
65
|
self,
|
|
70
66
|
similarities: SimSeq[float],
|
|
71
67
|
/,
|
|
72
|
-
) -> float:
|
|
73
|
-
...
|
|
68
|
+
) -> float: ...
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "cbrkit"
|
|
3
|
-
version = "0.7.0"
|
|
3
|
+
version = "0.9.0"
|
|
4
4
|
description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
|
|
5
5
|
authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -40,8 +40,10 @@ cbrkit = "cbrkit.cli:app"
|
|
|
40
40
|
|
|
41
41
|
[tool.poetry.dependencies]
|
|
42
42
|
python = ">=3.11, <3.13"
|
|
43
|
+
dtaidistance = { version = "^2.3", optional = true }
|
|
43
44
|
fastapi = { version = ">=0.100, <1.0", optional = true, extras = ["all"] }
|
|
44
45
|
levenshtein = { version = ">=0.23, <1.0", optional = true }
|
|
46
|
+
minineedle = { version = "^3.1", optional = true }
|
|
45
47
|
nltk = { version = "^3.8", optional = true }
|
|
46
48
|
openai = { version = "^1.5", optional = true }
|
|
47
49
|
orjson = "^3.9"
|
|
@@ -49,16 +51,19 @@ pandas = "^2.1"
|
|
|
49
51
|
pyarrow = ">=13.0"
|
|
50
52
|
pyyaml = "^6.0"
|
|
51
53
|
sentence-transformers = { version = "^2.2", optional = true }
|
|
54
|
+
setuptools = "^69"
|
|
52
55
|
spacy = { version = "^3.7", optional = true }
|
|
53
56
|
torch = { version = "^2.1.1", optional = true }
|
|
54
57
|
transformers = { version = "^4.35", optional = true }
|
|
55
58
|
typer = { version = ">=0.9, <1.0", extras = ["all"], optional = true }
|
|
56
59
|
uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
|
|
57
60
|
xmltodict = ">=0.13, <1.0"
|
|
61
|
+
pydantic = "^2.0"
|
|
58
62
|
|
|
59
63
|
[tool.poetry.group.dev.dependencies]
|
|
60
|
-
pytest = "^8.0
|
|
61
|
-
pytest-cov = "^5.0
|
|
64
|
+
pytest = "^8.0"
|
|
65
|
+
pytest-cov = "^5.0"
|
|
66
|
+
ruff = "^0.3"
|
|
62
67
|
|
|
63
68
|
[tool.poetry.group.docs.dependencies]
|
|
64
69
|
pdoc = "^14.4"
|
|
@@ -71,16 +76,18 @@ all = [
|
|
|
71
76
|
"openai",
|
|
72
77
|
"sentence-transformers",
|
|
73
78
|
"spacy",
|
|
74
|
-
"spacy",
|
|
75
79
|
"torch",
|
|
76
80
|
"transformers",
|
|
77
81
|
"typer",
|
|
78
82
|
"uvicorn",
|
|
83
|
+
"minineedle",
|
|
84
|
+
"dtaidistance",
|
|
79
85
|
]
|
|
80
86
|
cli = ["typer"]
|
|
81
87
|
api = ["fastapi", "uvicorn"]
|
|
82
88
|
nlp = ["levenshtein", "nltk", "openai", "spacy"]
|
|
83
89
|
transformers = ["sentence-transformers", "torch", "transformers"]
|
|
90
|
+
timeseries = ["minineedle", "dtaidistance"]
|
|
84
91
|
|
|
85
92
|
[tool.pytest.ini_options]
|
|
86
93
|
addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"
|
|
@@ -89,7 +96,7 @@ doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS"
|
|
|
89
96
|
[tool.ruff]
|
|
90
97
|
target-version = "py311"
|
|
91
98
|
|
|
92
|
-
[tool.ruff.pydocstyle]
|
|
99
|
+
[tool.ruff.lint.pydocstyle]
|
|
93
100
|
convention = "google"
|
|
94
101
|
|
|
95
102
|
[build-system]
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from collections.abc import Collection, Set
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
from cbrkit.helpers import dist2sim
|
|
5
|
-
from cbrkit.typing import SimPairFunc
|
|
6
|
-
|
|
7
|
-
__all__ = ["jaccard"]
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def jaccard() -> SimPairFunc[Collection[Any], float]:
|
|
11
|
-
"""Jaccard similarity function.
|
|
12
|
-
|
|
13
|
-
Examples:
|
|
14
|
-
>>> sim = jaccard()
|
|
15
|
-
>>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
|
|
16
|
-
0.8
|
|
17
|
-
"""
|
|
18
|
-
from nltk.metrics import jaccard_distance
|
|
19
|
-
|
|
20
|
-
def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
|
|
21
|
-
if not isinstance(x, Set):
|
|
22
|
-
x = set(x)
|
|
23
|
-
if not isinstance(y, Set):
|
|
24
|
-
y = set(y)
|
|
25
|
-
|
|
26
|
-
return dist2sim(jaccard_distance(x, y))
|
|
27
|
-
|
|
28
|
-
return wrapped_func
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|