cbrkit 0.7.0__tar.gz → 0.9.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cbrkit
3
- Version: 0.7.0
3
+ Version: 0.9.0
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
5
5
  Home-page: https://wi2trier.github.io/cbrkit/
6
6
  License: MIT
@@ -28,17 +28,22 @@ Provides-Extra: all
28
28
  Provides-Extra: api
29
29
  Provides-Extra: cli
30
30
  Provides-Extra: nlp
31
+ Provides-Extra: timeseries
31
32
  Provides-Extra: transformers
33
+ Requires-Dist: dtaidistance (>=2.3,<3.0) ; extra == "all" or extra == "timeseries"
32
34
  Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
33
35
  Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
36
+ Requires-Dist: minineedle (>=3.1,<4.0) ; extra == "all" or extra == "timeseries"
34
37
  Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
35
38
  Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
36
39
  Requires-Dist: orjson (>=3.9,<4.0)
37
40
  Requires-Dist: pandas (>=2.1,<3.0)
38
41
  Requires-Dist: pyarrow (>=13.0)
42
+ Requires-Dist: pydantic (>=2.0,<3.0)
39
43
  Requires-Dist: pyyaml (>=6.0,<7.0)
40
44
  Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
41
- Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "all" or extra == "nlp"
45
+ Requires-Dist: setuptools (>=69,<70)
46
+ Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
42
47
  Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
43
48
  Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
44
49
  Requires-Dist: typer[all] (>=0.9,<1.0) ; extra == "all" or extra == "cli"
@@ -5,7 +5,6 @@
5
5
 
6
6
  """
7
7
 
8
-
9
8
  from . import helpers, loaders, retrieval, sim, typing
10
9
 
11
10
  __all__ = [
@@ -2,7 +2,6 @@
2
2
  .. include:: ../cli.md
3
3
  """
4
4
 
5
-
6
5
  from pathlib import Path
7
6
 
8
7
  try:
@@ -1,7 +1,11 @@
1
+ """
2
+ This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a `validate` function is also provided.
3
+ """
4
+
1
5
  import csv as csvlib
2
6
  import tomllib
3
7
  from collections import abc
4
- from collections.abc import Callable, Iterator
8
+ from collections.abc import Callable, Iterator, Mapping
5
9
  from importlib import import_module
6
10
  from pathlib import Path
7
11
  from typing import Any, cast
@@ -11,6 +15,7 @@ import pandas as pd
11
15
  import xmltodict
12
16
  import yaml as yamllib
13
17
  from pandas import DataFrame, Series
18
+ from pydantic import BaseModel
14
19
 
15
20
  from cbrkit.typing import Casebase, FilePath
16
21
 
@@ -26,6 +31,7 @@ __all__ = [
26
31
  "python",
27
32
  "txt",
28
33
  "xml",
34
+ "validate",
29
35
  ]
30
36
 
31
37
 
@@ -325,6 +331,7 @@ def file(path: Path) -> Casebase[Any, Any] | None:
325
331
  >>> from pathlib import Path
326
332
  >>> file_path = Path("./data/cars-1k.csv")
327
333
  >>> result = file(file_path)
334
+
328
335
  """
329
336
  if path.suffix not in _batch_loaders:
330
337
  return None
@@ -348,16 +355,47 @@ def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None:
348
355
  Examples:
349
356
  >>> from pathlib import Path
350
357
  >>> folder_path = Path("./data")
351
- >>> result = folder(folder_path, ".csv")
358
+ >>> result = folder(folder_path, "*.csv")
359
+ >>> assert result is not None
352
360
  """
353
361
  cb: Casebase[Any, Any] = {}
354
362
 
355
363
  for file in path.glob(pattern):
356
364
  if file.is_file() and file.suffix in _single_loaders:
357
- loader = _single_loaders[path.suffix]
365
+ loader = _single_loaders[file.suffix]
358
366
  cb[file.name] = loader(file)
359
367
 
360
368
  if len(cb) == 0:
361
369
  return None
362
370
 
363
371
  return cb
372
+
373
+
374
+ def validate(data: Casebase[Any, Any] | Any, validation_model: BaseModel):
375
+ """Validates the data against a Pydantic model. Raises an AssertionError if data is None or a Pydantic ValidationError if the data does not match the model.
376
+
377
+ Args:
378
+ data: Data to validate. Can be an entire case base or a single case.
379
+ validation_model: Pydantic model to validate the data.
380
+
381
+ Examples:
382
+ >>> from pydantic import BaseModel, PositiveInt, NonNegativeInt
383
+ >>> from data.cars_validation_model import Car
384
+ >>> from pathlib import Path
385
+ >>> data = path(Path("data/cars-1k.csv"))
386
+ >>> validate(data, Car)
387
+ >>> import pandas as pd
388
+ >>> df = pd.read_csv("data/cars-1k.csv")
389
+ >>> data = dataframe(df)
390
+ >>> validate(data, Car)
391
+ """
392
+ assert data is not None
393
+
394
+ if isinstance(data, DataFrameCasebase):
395
+ data = data.df.to_dict("index")
396
+
397
+ if isinstance(data, Mapping):
398
+ for item in data.values():
399
+ validation_model.model_validate(item)
400
+ else:
401
+ validation_model.model_validate(data)
@@ -0,0 +1,91 @@
1
+ from collections.abc import Collection, Sequence, Set
2
+ from typing import Any
3
+
4
+ from cbrkit.helpers import dist2sim
5
+ from cbrkit.typing import SimPairFunc
6
+
7
+ Number = float | int
8
+
9
+ __all__ = ["jaccard", "smith_waterman", "dtw"]
10
+
11
+
12
+ def jaccard() -> SimPairFunc[Collection[Any], float]:
13
+ """Jaccard similarity function.
14
+
15
+ Examples:
16
+ >>> sim = jaccard()
17
+ >>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
18
+ 0.8
19
+ """
20
+ from nltk.metrics import jaccard_distance
21
+
22
+ def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
23
+ if not isinstance(x, Set):
24
+ x = set(x)
25
+ if not isinstance(y, Set):
26
+ y = set(y)
27
+
28
+ return dist2sim(jaccard_distance(x, y))
29
+
30
+ return wrapped_func
31
+
32
+
33
+ def smith_waterman(
34
+ match_score: int = 2, mismatch_penalty: int = -1, gap_penalty: int = -1
35
+ ) -> SimPairFunc[Sequence[Any], float]:
36
+ """
37
+ Performs the Smith-Waterman alignment with configurable scoring parameters. If no element matches it returns 0.0.
38
+
39
+ Args:
40
+ match_score: Score for matching characters. Defaults to 2.
41
+ mismatch_penalty: Penalty for mismatching characters. Defaults to -1.
42
+ gap_penalty: Penalty for gaps. Defaults to -1.
43
+
44
+ Example:
45
+ >>> sim = smith_waterman()
46
+ >>> sim("abcde", "fghe")
47
+ 2
48
+ """
49
+ from minineedle import core, smith
50
+
51
+ def wrapped_func(x: Sequence[Any], y: Sequence[Any]) -> float:
52
+ try:
53
+ alignment = smith.SmithWaterman(x, y)
54
+ alignment.change_matrix(
55
+ core.ScoreMatrix(
56
+ match=match_score, miss=mismatch_penalty, gap=gap_penalty
57
+ )
58
+ )
59
+ alignment.align()
60
+
61
+ return alignment.get_score()
62
+ except ZeroDivisionError:
63
+ return 0.0
64
+
65
+ return wrapped_func
66
+
67
+
68
+ def dtw() -> SimPairFunc[Collection[int], float]:
69
+ """Dynamic Time Warping similarity function.
70
+
71
+ Examples:
72
+ >>> sim = dtw()
73
+ >>> sim([1, 2, 3], [1, 2, 3, 4])
74
+ 0.5
75
+ """
76
+ import numpy as np
77
+ from dtaidistance import dtw
78
+
79
+ def wrapped_func(
80
+ x: Collection[Number] | np.ndarray, y: Collection[Number] | np.ndarray
81
+ ) -> float:
82
+ if not isinstance(x, np.ndarray):
83
+ x = np.array(x)
84
+ if not isinstance(y, np.ndarray):
85
+ y = np.array(y)
86
+
87
+ distance = dtw.distance(x, y)
88
+
89
+ return dist2sim(distance)
90
+
91
+ return wrapped_func
@@ -32,20 +32,17 @@ SimSeqOrMap = SimMap[KeyType, SimType] | SimSeq[SimType]
32
32
  class SimMapFunc(Protocol[KeyType, ValueType_contra, SimType_cov]):
33
33
  def __call__(
34
34
  self, x_map: Mapping[KeyType, ValueType_contra], y: ValueType_contra
35
- ) -> SimMap[KeyType, SimType_cov]:
36
- ...
35
+ ) -> SimMap[KeyType, SimType_cov]: ...
37
36
 
38
37
 
39
38
  class SimSeqFunc(Protocol[ValueType_contra, SimType_cov]):
40
39
  def __call__(
41
40
  self, pairs: Sequence[tuple[ValueType_contra, ValueType_contra]], /
42
- ) -> SimSeq[SimType_cov]:
43
- ...
41
+ ) -> SimSeq[SimType_cov]: ...
44
42
 
45
43
 
46
44
  class SimPairFunc(Protocol[ValueType_contra, SimType_cov]):
47
- def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov:
48
- ...
45
+ def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov: ...
49
46
 
50
47
 
51
48
  AnySimFunc = (
@@ -60,8 +57,7 @@ class AggregatorFunc(Protocol[KeyType, SimType_contra]):
60
57
  self,
61
58
  similarities: SimSeqOrMap[KeyType, SimType_contra],
62
59
  /,
63
- ) -> float:
64
- ...
60
+ ) -> float: ...
65
61
 
66
62
 
67
63
  class PoolingFunc(Protocol):
@@ -69,5 +65,4 @@ class PoolingFunc(Protocol):
69
65
  self,
70
66
  similarities: SimSeq[float],
71
67
  /,
72
- ) -> float:
73
- ...
68
+ ) -> float: ...
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cbrkit"
3
- version = "0.7.0"
3
+ version = "0.9.0"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
5
5
  authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
6
6
  license = "MIT"
@@ -40,8 +40,10 @@ cbrkit = "cbrkit.cli:app"
40
40
 
41
41
  [tool.poetry.dependencies]
42
42
  python = ">=3.11, <3.13"
43
+ dtaidistance = { version = "^2.3", optional = true }
43
44
  fastapi = { version = ">=0.100, <1.0", optional = true, extras = ["all"] }
44
45
  levenshtein = { version = ">=0.23, <1.0", optional = true }
46
+ minineedle = { version = "^3.1", optional = true }
45
47
  nltk = { version = "^3.8", optional = true }
46
48
  openai = { version = "^1.5", optional = true }
47
49
  orjson = "^3.9"
@@ -49,16 +51,19 @@ pandas = "^2.1"
49
51
  pyarrow = ">=13.0"
50
52
  pyyaml = "^6.0"
51
53
  sentence-transformers = { version = "^2.2", optional = true }
54
+ setuptools = "^69"
52
55
  spacy = { version = "^3.7", optional = true }
53
56
  torch = { version = "^2.1.1", optional = true }
54
57
  transformers = { version = "^4.35", optional = true }
55
58
  typer = { version = ">=0.9, <1.0", extras = ["all"], optional = true }
56
59
  uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
57
60
  xmltodict = ">=0.13, <1.0"
61
+ pydantic = "^2.0"
58
62
 
59
63
  [tool.poetry.group.dev.dependencies]
60
- pytest = "^8.0.0"
61
- pytest-cov = "^5.0.0"
64
+ pytest = "^8.0"
65
+ pytest-cov = "^5.0"
66
+ ruff = "^0.3"
62
67
 
63
68
  [tool.poetry.group.docs.dependencies]
64
69
  pdoc = "^14.4"
@@ -71,16 +76,18 @@ all = [
71
76
  "openai",
72
77
  "sentence-transformers",
73
78
  "spacy",
74
- "spacy",
75
79
  "torch",
76
80
  "transformers",
77
81
  "typer",
78
82
  "uvicorn",
83
+ "minineedle",
84
+ "dtaidistance",
79
85
  ]
80
86
  cli = ["typer"]
81
87
  api = ["fastapi", "uvicorn"]
82
88
  nlp = ["levenshtein", "nltk", "openai", "spacy"]
83
89
  transformers = ["sentence-transformers", "torch", "transformers"]
90
+ timeseries = ["minineedle", "dtaidistance"]
84
91
 
85
92
  [tool.pytest.ini_options]
86
93
  addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"
@@ -89,7 +96,7 @@ doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS"
89
96
  [tool.ruff]
90
97
  target-version = "py311"
91
98
 
92
- [tool.ruff.pydocstyle]
99
+ [tool.ruff.lint.pydocstyle]
93
100
  convention = "google"
94
101
 
95
102
  [build-system]
@@ -1,28 +0,0 @@
1
- from collections.abc import Collection, Set
2
- from typing import Any
3
-
4
- from cbrkit.helpers import dist2sim
5
- from cbrkit.typing import SimPairFunc
6
-
7
- __all__ = ["jaccard"]
8
-
9
-
10
- def jaccard() -> SimPairFunc[Collection[Any], float]:
11
- """Jaccard similarity function.
12
-
13
- Examples:
14
- >>> sim = jaccard()
15
- >>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
16
- 0.8
17
- """
18
- from nltk.metrics import jaccard_distance
19
-
20
- def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
21
- if not isinstance(x, Set):
22
- x = set(x)
23
- if not isinstance(y, Set):
24
- y = set(y)
25
-
26
- return dist2sim(jaccard_distance(x, y))
27
-
28
- return wrapped_func
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes