PyPI - cbrkit - Versions diffs - 0.7.0__tar.gz → 0.9.0__tar.gz - Mend

cbrkit 0.7.0.tar.gz → 0.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{cbrkit-0.7.0 → cbrkit-0.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cbrkit
-Version: 0.7.0
+Version: 0.9.0
 Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
 Home-page: https://wi2trier.github.io/cbrkit/
 License: MIT
@@ -28,17 +28,22 @@ Provides-Extra: all
 Provides-Extra: api
 Provides-Extra: cli
 Provides-Extra: nlp
+Provides-Extra: timeseries
 Provides-Extra: transformers
+Requires-Dist: dtaidistance (>=2.3,<3.0) ; extra == "all" or extra == "timeseries"
 Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
 Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
+Requires-Dist: minineedle (>=3.1,<4.0) ; extra == "all" or extra == "timeseries"
 Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
 Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
 Requires-Dist: orjson (>=3.9,<4.0)
 Requires-Dist: pandas (>=2.1,<3.0)
 Requires-Dist: pyarrow (>=13.0)
+Requires-Dist: pydantic (>=2.0,<3.0)
 Requires-Dist: pyyaml (>=6.0,<7.0)
 Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
-Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "all" or extra == "nlp"
+Requires-Dist: setuptools (>=69,<70)
+Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
 Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
 Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
 Requires-Dist: typer[all] (>=0.9,<1.0) ; extra == "all" or extra == "cli"

{cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/__init__.py RENAMED Viewed

@@ -5,7 +5,6 @@
 """
 from . import helpers, loaders, retrieval, sim, typing
 __all__ = [

{cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/cli.py RENAMED Viewed

@@ -2,7 +2,6 @@
 .. include:: ../cli.md
 """
 from pathlib import Path
 try:

{cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/loaders.py RENAMED Viewed

@@ -1,7 +1,11 @@
+"""
+This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a `validate` function is also provided.
+"""
 import csv as csvlib
 import tomllib
 from collections import abc
-from collections.abc import Callable, Iterator
+from collections.abc import Callable, Iterator, Mapping
 from importlib import import_module
 from pathlib import Path
 from typing import Any, cast
@@ -11,6 +15,7 @@ import pandas as pd
 import xmltodict
 import yaml as yamllib
 from pandas import DataFrame, Series
+from pydantic import BaseModel
 from cbrkit.typing import Casebase, FilePath
@@ -26,6 +31,7 @@ __all__ = [
     "python",
     "txt",
     "xml",
+    "validate",
 ]
@@ -325,6 +331,7 @@ def file(path: Path) -> Casebase[Any, Any] | None:
         >>> from pathlib import Path
         >>> file_path = Path("./data/cars-1k.csv")
         >>> result = file(file_path)
     """
     if path.suffix not in _batch_loaders:
         return None
@@ -348,16 +355,47 @@ def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None:
     Examples:
         >>> from pathlib import Path
         >>> folder_path = Path("./data")
-        >>> result = folder(folder_path, ".csv")
+        >>> result = folder(folder_path, "*.csv")
+        >>> assert result is not None
     """
     cb: Casebase[Any, Any] = {}
     for file in path.glob(pattern):
         if file.is_file() and file.suffix in _single_loaders:
-            loader = _single_loaders[path.suffix]
+            loader = _single_loaders[file.suffix]
             cb[file.name] = loader(file)
     if len(cb) == 0:
         return None
     return cb
+def validate(data: Casebase[Any, Any] | Any, validation_model: BaseModel):
+    """Validates the data against a Pydantic model. Throws a ValueError if data is None or a Pydantic ValidationError if the data does not match the model.
+    Args:
+        data: Data to validate. Can be an entire case base or a single case.
+        validation_model: Pydantic model to validate the data.
+    Examples:
+        >>> from pydantic import BaseModel, PositiveInt, NonNegativeInt
+        >>> from data.cars_validation_model import Car
+        >>> from pathlib import Path
+        >>> data = path(Path("data/cars-1k.csv"))
+        >>> validate(data, Car)
+        >>> import pandas as pd
+        >>> df = pd.read_csv("data/cars-1k.csv")
+        >>> data = dataframe(df)
+        >>> validate(data, Car)
+    """
+    assert data is not None
+    if isinstance(data, DataFrameCasebase):
+        data = data.df.to_dict("index")
+    if isinstance(data, Mapping):
+        for item in data.values():
+            validation_model.model_validate(item)
+    else:
+        validation_model.model_validate(data)

cbrkit-0.9.0/cbrkit/sim/collections.py ADDED Viewed

@@ -0,0 +1,91 @@
+from collections.abc import Collection, Sequence, Set
+from typing import Any
+from cbrkit.helpers import dist2sim
+from cbrkit.typing import SimPairFunc
+Number = float | int
+__all__ = ["jaccard", "smith_waterman", "dtw"]
+def jaccard() -> SimPairFunc[Collection[Any], float]:
+    """Jaccard similarity function.
+    Examples:
+        >>> sim = jaccard()
+        >>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
+        0.8
+    """
+    from nltk.metrics import jaccard_distance
+    def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
+        if not isinstance(x, Set):
+            x = set(x)
+        if not isinstance(y, Set):
+            y = set(y)
+        return dist2sim(jaccard_distance(x, y))
+    return wrapped_func
+def smith_waterman(
+    match_score: int = 2, mismatch_penalty: int = -1, gap_penalty: int = -1
+) -> SimPairFunc[Sequence[Any], float]:
+    """
+    Performs the Smith-Waterman alignment with configurable scoring parameters. If no element matches it returns 0.0.
+    Args:
+        match_score: Score for matching characters. Defaults to 2.
+        mismatch_penalty: Penalty for mismatching characters. Defaults to -1.
+        gap_penalty: Penalty for gaps. Defaults to -1.
+    Example:
+        >>> sim = smith_waterman()
+        >>> sim("abcde", "fghe")
+        2
+    """
+    from minineedle import core, smith
+    def wrapped_func(x: Sequence[Any], y: Sequence[Any]) -> float:
+        try:
+            alignment = smith.SmithWaterman(x, y)
+            alignment.change_matrix(
+                core.ScoreMatrix(
+                    match=match_score, miss=mismatch_penalty, gap=gap_penalty
+                )
+            )
+            alignment.align()
+            return alignment.get_score()
+        except ZeroDivisionError:
+            return 0.0
+    return wrapped_func
+def dtw() -> SimPairFunc[Collection[int], float]:
+    """Dynamic Time Warping similarity function.
+    Examples:
+        >>> sim = dtw()
+        >>> sim([1, 2, 3], [1, 2, 3, 4])
+        0.5
+    """
+    import numpy as np
+    from dtaidistance import dtw
+    def wrapped_func(
+        x: Collection[Number] | np.ndarray, y: Collection[Number] | np.ndarray
+    ) -> float:
+        if not isinstance(x, np.ndarray):
+            x = np.array(x)
+        if not isinstance(y, np.ndarray):
+            y = np.array(y)
+        distance = dtw.distance(x, y)
+        return dist2sim(distance)
+    return wrapped_func

{cbrkit-0.7.0 → cbrkit-0.9.0}/cbrkit/typing.py RENAMED Viewed

@@ -32,20 +32,17 @@ SimSeqOrMap = SimMap[KeyType, SimType] | SimSeq[SimType]
 class SimMapFunc(Protocol[KeyType, ValueType_contra, SimType_cov]):
     def __call__(
         self, x_map: Mapping[KeyType, ValueType_contra], y: ValueType_contra
-    ) -> SimMap[KeyType, SimType_cov]:
-        ...
+    ) -> SimMap[KeyType, SimType_cov]: ...
 class SimSeqFunc(Protocol[ValueType_contra, SimType_cov]):
     def __call__(
         self, pairs: Sequence[tuple[ValueType_contra, ValueType_contra]], /
-    ) -> SimSeq[SimType_cov]:
-        ...
+    ) -> SimSeq[SimType_cov]: ...
 class SimPairFunc(Protocol[ValueType_contra, SimType_cov]):
-    def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov:
-        ...
+    def __call__(self, x: ValueType_contra, y: ValueType_contra, /) -> SimType_cov: ...
 AnySimFunc = (
@@ -60,8 +57,7 @@ class AggregatorFunc(Protocol[KeyType, SimType_contra]):
         self,
         similarities: SimSeqOrMap[KeyType, SimType_contra],
         /,
-    ) -> float:
-        ...
+    ) -> float: ...
 class PoolingFunc(Protocol):
@@ -69,5 +65,4 @@ class PoolingFunc(Protocol):
         self,
         similarities: SimSeq[float],
         /,
-    ) -> float:
-        ...
+    ) -> float: ...

{cbrkit-0.7.0 → cbrkit-0.9.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cbrkit"
-version = "0.7.0"
+version = "0.9.0"
 description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
 authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
 license = "MIT"
@@ -40,8 +40,10 @@ cbrkit = "cbrkit.cli:app"
 [tool.poetry.dependencies]
 python = ">=3.11, <3.13"
+dtaidistance = { version = "^2.3", optional = true }
 fastapi = { version = ">=0.100, <1.0", optional = true, extras = ["all"] }
 levenshtein = { version = ">=0.23, <1.0", optional = true }
+minineedle = { version = "^3.1", optional = true }
 nltk = { version = "^3.8", optional = true }
 openai = { version = "^1.5", optional = true }
 orjson = "^3.9"
@@ -49,16 +51,19 @@ pandas = "^2.1"
 pyarrow = ">=13.0"
 pyyaml = "^6.0"
 sentence-transformers = { version = "^2.2", optional = true }
+setuptools = "^69"
 spacy = { version = "^3.7", optional = true }
 torch = { version = "^2.1.1", optional = true }
 transformers = { version = "^4.35", optional = true }
 typer = { version = ">=0.9, <1.0", extras = ["all"], optional = true }
 uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
 xmltodict = ">=0.13, <1.0"
+pydantic = "^2.0"
 [tool.poetry.group.dev.dependencies]
-pytest = "^8.0.0"
-pytest-cov = "^5.0.0"
+pytest = "^8.0"
+pytest-cov = "^5.0"
+ruff = "^0.3"
 [tool.poetry.group.docs.dependencies]
 pdoc = "^14.4"
@@ -71,16 +76,18 @@ all = [
     "openai",
     "sentence-transformers",
     "spacy",
-    "spacy",
     "torch",
     "transformers",
     "typer",
     "uvicorn",
+    "minineedle",
+    "dtaidistance",
 ]
 cli = ["typer"]
 api = ["fastapi", "uvicorn"]
 nlp = ["levenshtein", "nltk", "openai", "spacy"]
 transformers = ["sentence-transformers", "torch", "transformers"]
+timeseries = ["minineedle", "dtaidistance"]
 [tool.pytest.ini_options]
 addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"
@@ -89,7 +96,7 @@ doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS"
 [tool.ruff]
 target-version = "py311"
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
 convention = "google"
 [build-system]

cbrkit-0.7.0/cbrkit/sim/collections.py DELETED Viewed

@@ -1,28 +0,0 @@
-from collections.abc import Collection, Set
-from typing import Any
-from cbrkit.helpers import dist2sim
-from cbrkit.typing import SimPairFunc
-__all__ = ["jaccard"]
-def jaccard() -> SimPairFunc[Collection[Any], float]:
-    """Jaccard similarity function.
-    Examples:
-        >>> sim = jaccard()
-        >>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
-        0.8
-    """
-    from nltk.metrics import jaccard_distance
-    def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
-        if not isinstance(x, Set):
-            x = set(x)
-        if not isinstance(y, Set):
-            y = set(y)
-        return dist2sim(jaccard_distance(x, y))
-    return wrapped_func