PyPI - cbrkit - Versions diffs - 0.2.1__tar.gz → 0.3.1__tar.gz - Mend

cbrkit 0.2.1.tar.gz → 0.3.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{cbrkit-0.2.1 → cbrkit-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cbrkit
-Version: 0.2.1
+Version: 0.3.1
 Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
 Home-page: https://wi2trier.github.io/cbrkit/
 License: MIT
@@ -28,20 +28,21 @@ Provides-Extra: all
 Provides-Extra: api
 Provides-Extra: cli
 Provides-Extra: nlp
-Requires-Dist: fastapi[all] (>=0.104,<0.105) ; extra == "all" or extra == "api"
-Requires-Dist: levenshtein (>=0.23,<0.24) ; extra == "all" or extra == "nlp"
+Provides-Extra: transformers
+Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
+Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
 Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
-Requires-Dist: openai (>=1.3,<2.0) ; extra == "all" or extra == "nlp"
+Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
 Requires-Dist: orjson (>=3.9,<4.0)
 Requires-Dist: pandas (>=2.1,<3.0)
-Requires-Dist: pyarrow (>=14.0,<15.0)
+Requires-Dist: pyarrow (>=13.0)
 Requires-Dist: pyyaml (>=6.0,<7.0)
-Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "nlp"
-Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
-Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "nlp"
-Requires-Dist: transformers (>=4.36,<5.0) ; extra == "all" or extra == "nlp"
+Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
+Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "all" or extra == "nlp"
+Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
+Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
 Requires-Dist: typer[all] (>=0.9,<0.10) ; extra == "all" or extra == "cli"
-Requires-Dist: uvicorn[standard] (>=0.24,<0.25) ; extra == "all" or extra == "api"
+Requires-Dist: uvicorn[standard] (>=0.24,<1.0) ; extra == "all" or extra == "api"
 Requires-Dist: xmltodict (>=0.13,<0.14)
 Project-URL: Repository, https://github.com/wi2trier/cbrkit
 Description-Content-Type: text/markdown
@@ -64,11 +65,6 @@ Description-Content-Type: text/markdown
 # CBRkit
-> [!caution]
-> The project is under active development and does not yet adhere to semantic versioning.
-> Breaking changes may occur at any time for versions `0.x.y`.
-> Once the project reaches version `1.0`, semantic versioning will be applied.
 ## Installation
 The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
@@ -85,7 +81,8 @@ pip install cbrkit[EXTRA_NAME,...]
 where `EXTRA_NAME` is one of the following:
-- `nlp`: Natural Language Processing (NLP), including `spacy`, `openai`, and `sentence-transformers`
+- `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
+- `transformers`: NLP tools based on `pytorch` and `transformers`
 - `cli`: Command Line Interface (CLI)
 - `api`: REST API Server
 - `all`: All of the above
@@ -95,12 +92,36 @@ where `EXTRA_NAME` is one of the following:
 CBRkit allows the definition of similarity metrics through _composition_.
 This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
 CBRkit also includes predefined aggregation functions.
-A working retrieval example can be found as part of our [testing suite](https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py).
+To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-demo) that shows how to use the library in a real-world scenario.
 The following modules are part of CBRkit:
+- `loaders`: Functions for loading cases and queries.
 - `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
 - `global_sim`: Similarity generator functions for aggregating the above ones.
 - `retrieval`: Functions for retrieving cases based on a query.
 - `typing`: Generic type definitions for defining custom functions.
+CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
+We will explain all modules and their basic usage in the following sections.
+### Loading Cases
+The first step is to load cases and queries.
+We provide predefined functions for the most common formats like CSV, JSON, and XML.
+Additionally, `cbrkit` also integrates with `pandas` for loading data frames.
+The following example shows how to load cases and queries from a CSV file using `pandas`:
+```python
+import pandas as pd
+import cbrkit
+df = pd.read_csv("path/to/cases.csv")
+cases = cbrkit.loaders.dataframe(df)
+```
+Queries can either be loaded using the same loader functions or constructed manually.
+```python
+queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
+```

{cbrkit-0.2.1 → cbrkit-0.3.1}/README.md RENAMED Viewed

@@ -16,11 +16,6 @@
 # CBRkit
-> [!caution]
-> The project is under active development and does not yet adhere to semantic versioning.
-> Breaking changes may occur at any time for versions `0.x.y`.
-> Once the project reaches version `1.0`, semantic versioning will be applied.
 ## Installation
 The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
@@ -37,7 +32,8 @@ pip install cbrkit[EXTRA_NAME,...]
 where `EXTRA_NAME` is one of the following:
-- `nlp`: Natural Language Processing (NLP), including `spacy`, `openai`, and `sentence-transformers`
+- `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
+- `transformers`: NLP tools based on `pytorch` and `transformers`
 - `cli`: Command Line Interface (CLI)
 - `api`: REST API Server
 - `all`: All of the above
@@ -47,11 +43,35 @@ where `EXTRA_NAME` is one of the following:
 CBRkit allows the definition of similarity metrics through _composition_.
 This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
 CBRkit also includes predefined aggregation functions.
-A working retrieval example can be found as part of our [testing suite](https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py).
+To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-demo) that shows how to use the library in a real-world scenario.
 The following modules are part of CBRkit:
+- `loaders`: Functions for loading cases and queries.
 - `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
 - `global_sim`: Similarity generator functions for aggregating the above ones.
 - `retrieval`: Functions for retrieving cases based on a query.
 - `typing`: Generic type definitions for defining custom functions.
+CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
+We will explain all modules and their basic usage in the following sections.
+### Loading Cases
+The first step is to load cases and queries.
+We provide predefined functions for the most common formats like CSV, JSON, and XML.
+Additionally, `cbrkit` also integrates with `pandas` for loading data frames.
+The following example shows how to load cases and queries from a CSV file using `pandas`:
+```python
+import pandas as pd
+import cbrkit
+df = pd.read_csv("path/to/cases.csv")
+cases = cbrkit.loaders.dataframe(df)
+```
+Queries can either be loaded using the same loader functions or constructed manually.
+```python
+queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
+```

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/_aggregate.py RENAMED Viewed

@@ -51,6 +51,20 @@ def aggregator(
     pooling_weights: SimSeqOrMap[KeyType, float] | None = None,
     default_pooling_weight: float = 1.0,
 ) -> AggregatorFunc[KeyType, AnyFloat]:
+    """
+    Aggregates local similarities to a global similarity using the specified pooling function.
+    Args:
+        pooling: The pooling function to use. It can be either a string representing the name of the pooling function or a custom pooling function (see `cbrkit.typing.PoolingFunc`).
+        pooling_weights: The weights to apply to the similarities during pooling. It can be a sequence or a mapping. If None, every local similarity is weighted equally.
+        default_pooling_weight: The default weight to use if a similarity key is not found in the pooling_weights mapping.
+    Examples:
+        >>> global_sim = aggregator("mean")
+        >>> global_sim([0.5, 0.75, 1.0])
+        0.75
+    """
     pooling_func = _pooling_funcs[pooling] if isinstance(pooling, str) else pooling
     def wrapped_func(similarities: SimSeqOrMap[KeyType, AnyFloat]) -> float:

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/_attribute_value.py RENAMED Viewed

@@ -60,6 +60,19 @@ def attribute_value(
     value_getter: Callable[[Any, str], Any] = _value_getter,
     key_getter: Callable[[Any], Iterator[str]] = _key_getter,
 ) -> SimMapFunc[Any, AttributeValueData, AttributeValueSim[SimType]]:
+    """
+    Similarity function that computes the attribute value similarity between two cases.
+    Args:
+        attributes: A mapping of attribute names to the similarity functions to be used for those attributes. Takes precedence over types.
+        types: A mapping of attribute types to the similarity functions to be used for those types.
+        types_fallback: A similarity function to be used as a fallback when no specific similarity function
+            is defined for an attribute type.
+        aggregator: A function that aggregates the local similarity scores for each attribute into a single global similarity.
+        value_getter: A function that retrieves the value of an attribute from a case.
+        key_getter: A function that retrieves the attribute names from a target case.
+    """
     attributes_map: Mapping[str, AnySimFunc[KeyType, Any, SimType]] = (
         {} if attributes is None else attributes
     )

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/graph/_astar.py RENAMED Viewed

@@ -22,7 +22,7 @@ from cbrkit.typing import Casebase, FloatProtocol, KeyType, SimPairFunc, SimType
 logger = logging.getLogger(__name__)
-@dataclass
+@dataclass(slots=True)
 class GraphMapping(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
     """Store all mappings and perform integrity checks on them"""
@@ -107,7 +107,7 @@ class GraphMapping(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
         self.edge_mappings[x] = y
-@dataclass
+@dataclass(slots=True)
 class SearchNode(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
     """Specific search node"""
@@ -149,6 +149,18 @@ def astar(
     edge_sim_func: SimPairFunc[EdgeData, SimType],
     queue_limit: int,
 ) -> dict[KeyType, GraphSim[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]]:
+    """
+    Performs the A* algorithm proposed by [Bergmann and Gil (2014)](https://doi.org/10.1016/j.is.2012.07.005) to compute the similarity between a query graph and the graphs in the casebase.
+    Args:
+        x_map: A casebase of graphs
+        y: Query graph
+        node_sim_func: A similarity function for graph nodes
+        edge_sim_func: A similarity function for graph edges
+        queue_limit: Limits the queue size which prunes the search space. This leads to a faster search and less memory usage but also introduces a similarity error.
+    """
     results = {
         key: _astar_single(
             x,

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/graph/_model.py RENAMED Viewed

@@ -19,7 +19,7 @@ class NodeProtocol(Hashable, Protocol[NodeData]):
     data: NodeData
-@dataclass
+@dataclass(slots=True)
 class Graph(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
     nodes: dict[NodeKey, NodeProtocol[NodeData]]
     edges: dict[EdgeKey, EdgeProtocol[EdgeData, NodeKey]]

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/loaders.py RENAMED Viewed

@@ -53,6 +53,8 @@ def python(import_name: str) -> Any:
 class DataFrameCasebase(abc.Mapping):
+    __slots__ = ("df",)
     df: DataFrame
     def __init__(self, df: DataFrame) -> None:
@@ -74,10 +76,35 @@ class DataFrameCasebase(abc.Mapping):
 def dataframe(df: DataFrame) -> Casebase[Any, pd.Series]:
+    """Converts a pandas DataFrame into a Casebase.
+    Args:
+        df: pandas DataFrame.
+    Returns:
+        Returns a Casebase as a DataFrameCasebase.
+    Examples:
+        >>> file_path = "./data/cars-1k.csv"
+        >>> df = pd.read_csv(file_path)
+        >>> result = dataframe(df)
+    """
     return DataFrameCasebase(df)
 def csv(path: FilePath) -> dict[int, dict[str, str]]:
+    """Reads a csv file and converts it into a dict representation
+    Args:
+        path: File path of the csv file
+    Returns:
+        Dict representation of the csv file.
+    Examples:
+        >>> file_path = "./data/cars-1k.csv"
+        >>> result = csv(file_path)
+    """
     data: dict[int, dict[str, str]] = {}
     with open(path) as fp:
@@ -96,32 +123,105 @@ def _csv_pandas(path: FilePath) -> dict[int, pd.Series]:
     return cast(dict[int, pd.Series], dataframe(df))
-def json(path: FilePath) -> dict[str, Any]:
+def json(path: FilePath) -> dict[Any, Any]:
+    """Reads a json file and converts it into a dict representation
+    Args:
+        path: File path of the json file
+    Returns:
+        Dict representation of the json file.
+    Examples:
+        >>> file_path = "data/cars-1k.json"     # doctest: +SKIP
+        >>> json(file_path)                     # doctest: +SKIP
+    """
     with open(path, "rb") as fp:
-        return orjson.loads(fp.read())
+        data = orjson.loads(fp.read())
+        if isinstance(data, list):
+            return dict(enumerate(data))
+        elif isinstance(data, dict):
+            return data
+        else:
+            raise TypeError(f"Invalid data type: {type(data)}")
 def toml(path: FilePath) -> dict[str, Any]:
+    """Reads a toml file and parses it into a dict representation
+    Args:
+        path: File path of the toml file
+    Returns:
+        Dict representation of the toml file.
+    Examples:
+        >>> file_path = "./data/file.toml"      # doctest: +SKIP
+        >>> toml(file_path)                     # doctest: +SKIP
+    """
     with open(path, "rb") as fp:
         return tomllib.load(fp)
-def yaml(path: FilePath) -> dict[str, Any]:
-    data: dict[str, Any] = {}
+def yaml(path: FilePath) -> dict[Any, Any]:
+    """Reads a yaml file and parses it into a dict representation
+    Args:
+        path: File path of the yaml file
+    Returns:
+        Dict representation of the yaml file.
+    Examples:
+        >>> file_path = "./data/cars-1k.yaml"
+        >>> result = yaml(file_path)
+    """
+    data: dict[Any, Any] = {}
     with open(path, "rb") as fp:
-        for doc in yamllib.safe_load_all(fp):
-            data |= doc
+        for doc_idx, doc in enumerate(yamllib.safe_load_all(fp)):
+            if isinstance(doc, list):
+                for idx, item in enumerate(doc):
+                    data[doc_idx + idx] = item
+            elif isinstance(doc, dict):
+                data |= doc
+            else:
+                raise TypeError(f"Invalid document type: {type(doc)}")
     return data
 def txt(path: FilePath) -> str:
+    """Reads a text file and converts it into a string
+    Args:
+        path: File path of the text file
+    Returns:
+        String representation of the text file.
+    Examples:
+        >>> file_path = "data/file.txt"      # doctest: +SKIP
+        >>> txt(file_path)                   # doctest: +SKIP
+    """
     with open(path) as fp:
         return fp.read()
 def xml(path: FilePath) -> dict[str, Any]:
+    """Reads a xml file and parses it into a dict representation
+    Args:
+        path: File path of the xml file
+    Returns:
+        Dict representation of the xml file.
+    Examples:
+        >>> file_path = "data/file.xml"      # doctest: +SKIP
+        >>> result = xml(file_path)          # doctest: +SKIP
+    """
     with open(path, "rb") as fp:
         data = xmltodict.parse(fp.read())
@@ -159,6 +259,18 @@ _single_loaders: dict[str, SingleLoader] = {
 def data(path: FilePath) -> dict[str, Any]:
+    """Reads a file of type json, toml, yaml, or yml and parses it into a dict representation
+    Args:
+        path: Path of the file
+    Returns:
+        Dict representation of the file.
+    Examples:
+        >>> yaml_file = "./data/cars-1k.yaml"
+        >>> result = data(yaml_file)
+    """
     if isinstance(path, str):
         path = Path(path)
@@ -170,6 +282,18 @@ def data(path: FilePath) -> dict[str, Any]:
 def path(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
+    """Converts a path into a Casebase. The path can be a folder or a file.
+    Args:
+        path: Path of the file.
+    Returns:
+        Returns a Casebase.
+    Examples:
+        >>> file_path = "./data/cars-1k.csv"
+        >>> result = path(file_path)
+    """
     if isinstance(path, str):
         path = Path(path)
@@ -189,6 +313,19 @@ def path(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
 def file(path: Path) -> Casebase[Any, Any] | None:
+    """Converts a file into a Casebase. The file can be of type csv, json, toml, yaml, or yml.
+    Args:
+        path: Path of the file.
+    Returns:
+        Returns a Casebase, or None if the file suffix has no registered batch loader.
+    Examples:
+        >>> from pathlib import Path
+        >>> file_path = Path("./data/cars-1k.csv")
+        >>> result = file(file_path)
+    """
     if path.suffix not in _batch_loaders:
         return None
@@ -199,6 +336,20 @@ def file(path: Path) -> Casebase[Any, Any] | None:
 def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None:
+    """Converts the files of a folder into a Casebase. The files can be of type txt, csv, json, toml, yaml, or yml.
+    Args:
+        path: Path of the folder.
+        pattern: Relative pattern for the files.
+    Returns:
+        Returns a Casebase.
+    Examples:
+        >>> from pathlib import Path
+        >>> folder_path = Path("./data")
+        >>> result = folder(folder_path, "*.csv")
+    """
     cb: Casebase[Any, Any] = {}
     for file in path.glob(pattern):

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/retrieval.py RENAMED Viewed

@@ -29,7 +29,7 @@ def _similarities2ranking(
     return sorted(sim_map, key=lambda key: unpack_sim(sim_map[key]), reverse=True)
-@dataclass
+@dataclass(slots=True)
 class _Result(Generic[KeyType, ValueType, SimType]):
     similarities: SimMap[KeyType, SimType]
     ranking: list[KeyType]
@@ -47,7 +47,7 @@ class _Result(Generic[KeyType, ValueType, SimType]):
         return cls(similarities=similarities, ranking=ranking, casebase=casebase)
-@dataclass
+@dataclass(slots=True)
 class Result(Generic[KeyType, ValueType, SimType]):
     final: _Result[KeyType, ValueType, SimType]
     intermediate: list[_Result[KeyType, ValueType, SimType]]
@@ -78,6 +78,40 @@ def apply(
     retrievers: RetrieveFunc[KeyType, ValueType, SimType]
     | Sequence[RetrieveFunc[KeyType, ValueType, SimType]],
 ) -> Result[KeyType, ValueType, SimType]:
+    """Applies a query to a Casebase using retriever functions.
+    Args:
+        casebase: The casebase for the query.
+        query: The query that will be applied to the casebase
+        retrievers: Retriever functions that will retrieve similar cases (compared to the query) from the casebase
+    Returns:
+        Returns an object of type Result.
+    Examples:
+        >>> import cbrkit
+        >>> import pandas as pd
+        >>> df = pd.read_csv("./data/cars-1k.csv")
+        >>> casebase = cbrkit.loaders.dataframe(df)
+        >>> query = casebase[42]
+        >>> retriever = cbrkit.retrieval.build(
+        ...     cbrkit.global_sim.attribute_value(
+        ...         attributes={
+        ...             "price": cbrkit.sim.numeric.linear(max=100000),
+        ...             "year": cbrkit.sim.numeric.linear(max=50),
+        ...             "manufacturer": cbrkit.sim.taxonomy.load(
+        ...                 "./data/cars-taxonomy.yaml",
+        ...                 measure=cbrkit.sim.taxonomy.wu_palmer(),
+        ...             ),
+        ...             "miles": cbrkit.sim.numeric.linear(max=1000000),
+        ...         },
+        ...         types_fallback=cbrkit.sim.generic.equality(),
+        ...         aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
+        ...     ),
+        ...     limit=5,
+        ... )
+        >>> result = cbrkit.retrieval.apply(casebase, query, retriever)
+    """
     if not isinstance(retrievers, Sequence):
         retrievers = [retrievers]
@@ -99,6 +133,37 @@ def build(
     similarity_func: AnySimFunc[KeyType, ValueType, SimType],
     limit: int | None = None,
 ) -> RetrieveFunc[KeyType, ValueType, SimType]:
+    """Creates a retriever function from the given similarity function.
+    Args:
+        similarity_func: Similarity function to compute the similarity between cases.
+        limit: Maximum number of cases to return; the retriever keeps only the top `limit` cases.
+    Returns:
+        Returns the retriever function.
+    Examples:
+        >>> import cbrkit
+        >>> retriever = cbrkit.retrieval.build(
+        ...     cbrkit.global_sim.attribute_value(
+        ...         attributes={
+        ...             "price": cbrkit.sim.numeric.linear(max=100000),
+        ...             "year": cbrkit.sim.numeric.linear(max=50),
+        ...             "model": cbrkit.global_sim.attribute_value(
+        ...                 attributes={
+        ...                     "make": cbrkit.sim.generic.equality(),
+        ...                     "manufacturer": cbrkit.sim.taxonomy.load(
+        ...                         "./data/cars-taxonomy.yaml",
+        ...                         measure=cbrkit.sim.taxonomy.wu_palmer(),
+        ...                     ),
+        ...                 }
+        ...             ),
+        ...         },
+        ...         aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
+        ...     ),
+        ...     limit=5,
+        ... )
+    """
     sim_func = sim2map(similarity_func)
     def wrapped_func(

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/_helpers.py RENAMED Viewed

@@ -26,12 +26,35 @@ __all__ = [
 def dist2sim(distance: float) -> float:
+    """Convert a distance to a similarity.
+    Args:
+        distance: The distance to convert
+    Examples:
+        >>> dist2sim(1.)
+        0.5
+    """
     return 1 / (1 + distance)
 def sim2seq(
     func: SimPairFunc[ValueType, SimType] | SimSeqFunc[ValueType, SimType],
 ) -> SimSeqFunc[ValueType, SimType]:
+    """
+    Converts a similarity function that operates on pairs of values into a similarity function that operates on sequences of values.
+    Args:
+        func: The similarity function to be converted.
+    Examples:
+        >>> def sim_func(x: int, y: int) -> float:
+        ...     return abs(x - y) / max(x, y)
+        ...
+        >>> seq_func = sim2seq(sim_func)
+        >>> seq_func([(1, 2), (3, 4), (5, 6)])
+        [0.5, 0.25, 0.16666666666666666]
+    """
     signature = inspect_signature(func)
     if len(signature.parameters) == 2:

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/collections.py RENAMED Viewed

@@ -4,8 +4,17 @@ from typing import Any
 from cbrkit.sim._helpers import dist2sim
 from cbrkit.typing import SimPairFunc
+__all__ = ["jaccard"]
 def jaccard() -> SimPairFunc[Collection[Any], float]:
+    """Jaccard similarity function.
+    Examples:
+        >>> sim = jaccard()
+        >>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
+        0.8
+    """
     from nltk.metrics import jaccard_distance
     def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/generic.py RENAMED Viewed

@@ -7,12 +7,29 @@ from cbrkit.typing import (
     ValueType,
 )
+__all__ = ["table", "equality"]
 def table(
     entries: Sequence[tuple[ValueType, ValueType, float]],
     symmetric: bool = True,
     default: float = 0.0,
 ) -> SimPairFunc[ValueType, float]:
+    """Builds a similarity function from a table of predefined similarity values.
+    Args:
+        entries: Sequence[tuple[a, b, sim(a, b)]]
+        symmetric: If True, the table is assumed to be symmetric, i.e. sim(a, b) = sim(b, a)
+        default: Default similarity value for pairs not in the table
+    Examples:
+        >>> sim = table([("a", "b", 0.5), ("b", "c", 0.7)], symmetric=True, default=0.0)
+        >>> sim("b", "a")
+        0.5
+        >>> sim("a", "c")
+        0.0
+    """
     table: defaultdict[ValueType, defaultdict[ValueType, float]] = defaultdict(
         lambda: defaultdict(lambda: default)
     )
@@ -30,6 +47,16 @@ def table(
 def equality() -> SimPairFunc[Any, float]:
+    """Equality similarity function. Returns 1.0 if the two values are equal, 0.0 otherwise.
+    Examples:
+        >>> sim = equality()
+        >>> sim("b", "a")
+        0.0
+        >>> sim("a", "a")
+        1.0
+    """
     def wrapped_func(x: Any, y: Any) -> float:
         return 1.0 if x == y else 0.0

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/numeric.py RENAMED Viewed

@@ -15,6 +15,9 @@ def linear(max: float, min: float = 0.0) -> SimPairFunc[Number, float]:
         min: Minimum bound of the interval
     ![linear](../../assets/numeric/linear.png)
+    >>> sim = linear(100)
+    >>> sim(50, 60)
+    0.9
     """
     def wrapped_func(x: Number, y: Number) -> float:
@@ -37,6 +40,12 @@ def threshold(threshold: float) -> SimPairFunc[Number, float]:
         threshold: If the absolute difference between the two values is less than or equal to this value, the similarity is 1.0, otherwise it is 0.0
     ![threshold](../../assets/numeric/threshold.png)
+    Examples:
+        >>> sim = threshold(10)
+        >>> sim(50, 60)
+        1.0
+        >>> sim(50, 61)
+        0.0
     """
     def wrapped_func(x: Number, y: Number) -> float:
@@ -49,9 +58,13 @@ def exponential(alpha: float = 1.0) -> SimPairFunc[Number, float]:
     """Exponential similarity function.
     Args:
-        alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the function grows.
+        alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the similarity decreases.
     ![exponential](../../assets/numeric/exponential.png)
+    Examples:
+        >>> sim = exponential(0.1)
+        >>> sim(50, 60)
+        0.36787944117144233
     """
     def wrapped_func(x: Number, y: Number) -> float:
@@ -68,6 +81,12 @@ def sigmoid(alpha: float = 1.0, theta: float = 1.0) -> SimPairFunc[Number, float
         theta: Specifies the point at which the similarity value is 0.5.
     ![sigmoid](../../assets/numeric/sigmoid.png)
+    Examples:
+        >>> sim = sigmoid(1, 10)
+        >>> sim(50, 60)
+        0.5
+        >>> sim(50, 58)
+        0.8807970779778823
     """
     def wrapped_func(x: Number, y: Number) -> float:

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/strings.py RENAMED Viewed

@@ -12,8 +12,24 @@ from cbrkit.typing import (
     SimSeqFunc,
 )
+__all__ = [
+    "spacy",
+    "sentence_transformers",
+    "openai",
+    "levenshtein",
+    "jaro",
+    "jaro_winkler",
+    "table",
+]
 def _cosine(u, v) -> float:
+    """Cosine similarity between two vectors
+    Args:
+        u: First vector
+        v: Second vector
+    """
     import numpy as np
     import scipy.spatial.distance as scipy_dist
@@ -28,6 +44,11 @@ def _unique_items(pairs: Sequence[tuple[str, str]]) -> list[str]:
 def spacy(model_name: str = "en_core_web_lg") -> SimSeqFunc[str, float]:
+    """[spaCy](https://spacy.io/usage/linguistic-features/#vectors-similarity) based semantic similarity using word vectors. It calculates the similarity between given text pairs.
+    Args:
+        model_name: Name of the [spaCy model](https://spacy.io/usage/models) to use to generate word vectors. Defaults to "en_core_web_lg".
+    """
     from spacy import load as spacy_load
     nlp = spacy_load(model_name)
@@ -46,6 +67,11 @@ def spacy(model_name: str = "en_core_web_lg") -> SimSeqFunc[str, float]:
 def sentence_transformers(model_name: str) -> SimSeqFunc[str, float]:
+    """[Sentence-Transformers](https://www.sbert.net/) based semantic similarity using word vectors. It calculates the cosine similarity between given text pairs.
+    Args:
+        model_name: Name of the [pretrained model](https://www.sbert.net/docs/pretrained_models.html) to use to generate word vectors.
+    """
     from sentence_transformers import SentenceTransformer
     model = SentenceTransformer(model_name)
@@ -61,6 +87,11 @@ def sentence_transformers(model_name: str) -> SimSeqFunc[str, float]:
 def openai(model_name: str) -> SimSeqFunc[str, float]:
+    """Semantic similarity using word vectors generated by one of OpenAI's embedding models. It calculates the cosine similarity between given text pairs.
+    Args:
+        model_name: Name of the [embedding model](https://platform.openai.com/docs/models/embeddings) to use to generate word vectors.
+    """
     import numpy as np
     from openai import Client
@@ -78,6 +109,18 @@ def openai(model_name: str) -> SimSeqFunc[str, float]:
 def levenshtein(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
+    """Similarity function that calculates a normalized indel similarity between two strings based on [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance).
+    Args:
+        score_cutoff: If the similarity is less than this value, the function will return 0.0.
+    Examples:
+        >>> sim = levenshtein()
+        >>> sim("kitten", "sitting")
+        0.6153846153846154
+        >>> sim = levenshtein(score_cutoff=0.8)
+        >>> sim("kitten", "sitting")
+        0.0
+    """
     import Levenshtein
     def wrapped_func(x: str, y: str) -> float:
@@ -87,6 +130,18 @@ def levenshtein(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
 def jaro(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
+    """Jaro similarity function to compute similarity between two strings.
+    Args:
+        score_cutoff: If the similarity is less than this value, the function will return 0.0.
+    Examples:
+        >>> sim = jaro()
+        >>> sim("kitten", "sitting")
+        0.746031746031746
+        >>> sim = jaro(score_cutoff=0.8)
+        >>> sim("kitten", "sitting")
+        0.0
+    """
     import Levenshtein
     def wrapped_func(x: str, y: str) -> float:
@@ -96,8 +151,21 @@ def jaro(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
 def jaro_winkler(
-    score_cutoff: float | None = None, prefix_weight: float | None = None
+    score_cutoff: float | None = None, prefix_weight: float = 0.1
 ) -> SimPairFunc[str, float]:
+    """Jaro-Winkler similarity function to compute similarity between two strings.
+    Args:
+        score_cutoff: If the similarity is less than this value, the function will return 0.0.
+        prefix_weight: Weight used for the common prefix of the two strings. Has to be between 0 and 0.25. Default is 0.1.
+    Examples:
+        >>> sim = jaro_winkler()
+        >>> sim("kitten", "sitting")
+        0.746031746031746
+        >>> sim = jaro_winkler(score_cutoff=0.8)
+        >>> sim("kitten", "sitting")
+        0.0
+    """
     import Levenshtein
     def wrapped_func(x: str, y: str) -> float:
@@ -113,6 +181,20 @@ def table(
     symmetric: bool = True,
     default: float = 0.0,
 ) -> SimPairFunc[str, float]:
+    """Allows importing similarity values from a table.
+    Args:
+        entries: Sequence[tuple[a, b, sim(a, b)]]
+        symmetric: If True, the table is assumed to be symmetric, i.e. sim(a, b) = sim(b, a)
+        default: Default similarity value for pairs not in the table
+    Examples:
+        >>> sim = table([("a", "b", 0.5), ("b", "c", 0.7)], symmetric=True, default=0.0)
+        >>> sim("b", "a")
+        0.5
+        >>> sim("a", "c")
+        0.0
+    """
     if isinstance(entries, FilePath):
         if isinstance(entries, str):
             entries = Path(entries)

{cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/taxonomy.py RENAMED Viewed

@@ -4,6 +4,8 @@ from typing import Optional, Protocol, TypedDict, cast
 from cbrkit.loaders import data as load_data
 from cbrkit.typing import FilePath, SimPairFunc
+__all__ = ["Taxonomy", "TaxonomyNode", "TaxonomyFunc", "load", "wu_palmer"]
 class SerializedNode(TypedDict, total=False):
     key: str
@@ -11,7 +13,7 @@ class SerializedNode(TypedDict, total=False):
     children: list["SerializedNode | str"]
-@dataclass
+@dataclass(slots=True)
 class TaxonomyNode:
     key: str
     weight: float | None
@@ -21,6 +23,8 @@ class TaxonomyNode:
 class Taxonomy:
+    __slots__ = ("root", "nodes")
     root: TaxonomyNode
     nodes: dict[str, TaxonomyNode]
@@ -74,6 +78,15 @@ class TaxonomyFunc(Protocol):
 def wu_palmer() -> TaxonomyFunc:
+    """Wu & Palmer similarity measure of two nodes in a taxonomy.
+    >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
+    >>> sim = wu_palmer()
+    >>> sim(taxonomy, "audi", "porsche")
+    0.5
+    >>> sim(taxonomy, "audi", "bmw")
+    0.0
+    """
     def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
         node1 = taxonomy.nodes[x]
         node2 = taxonomy.nodes[y]
@@ -90,6 +103,13 @@ _taxonomy_func = wu_palmer()
 def load(
     path: FilePath, measure: TaxonomyFunc = _taxonomy_func
 ) -> SimPairFunc[str, float]:
+    """Load a taxonomy and return a function that measures the similarity.
+    >>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
+    >>> sim("audi", "porsche")
+    0.5
+    >>> sim("audi", "bmw")
+    0.0
+    """
     taxonomy = Taxonomy(path)
     def wrapped_func(x: str, y: str) -> float:

{cbrkit-0.2.1 → cbrkit-0.3.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cbrkit"
-version = "0.2.1"
+version = "0.3.1"
 description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
 authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
 license = "MIT"
@@ -40,53 +40,47 @@ cbrkit = "cbrkit.cli:app"
 [tool.poetry.dependencies]
 python = ">=3.11, <3.13"
+fastapi = { version = ">=0.100, <1.0", optional = true, extras = ["all"] }
+levenshtein = { version = ">=0.23, <1.0", optional = true }
+nltk = { version = "^3.8", optional = true }
+openai = { version = "^1.5", optional = true }
+orjson = "^3.9"
 pandas = "^2.1"
+pyarrow = ">=13.0"
 pyyaml = "^6.0"
-orjson = "^3.9"
-xmltodict = "^0.13"
-pyarrow = "^14.0"
-typer = { version = "^0.9", extras = ["all"], optional = true }
-fastapi = { version = "^0.104", optional = true, extras = ["all"] }
-uvicorn = { version = "^0.24", optional = true, extras = ["standard"] }
-spacy = { version = "^3.7", optional = true }
-nltk = { version = "^3.8", optional = true }
-levenshtein = { version = "^0.23", optional = true }
 sentence-transformers = { version = "^2.2", optional = true }
-openai = { version = "^1.3", optional = true }
+spacy = { version = "^3.7", optional = true }
 torch = { version = "^2.1.1", optional = true }
-transformers = { version = "^4.36", optional = true }
+transformers = { version = "^4.35", optional = true }
+typer = { version = "^0.9", extras = ["all"], optional = true }
+uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
+xmltodict = "^0.13"
 [tool.poetry.group.dev.dependencies]
-pytest = "^7.4"
+pytest = "^8.0.0"
 pytest-cov = "^4.1"
 [tool.poetry.group.docs.dependencies]
-pdoc = "^14.1"
+pdoc = "^14.4"
 [tool.poetry.extras]
 all = [
-    "typer",
     "fastapi",
-    "uvicorn",
-    "spacy",
-    "nltk",
     "levenshtein",
-    "sentence-transformers",
+    "nltk",
     "openai",
+    "sentence-transformers",
+    "spacy",
+    "spacy",
     "torch",
     "transformers",
+    "typer",
+    "uvicorn",
 ]
 cli = ["typer"]
 api = ["fastapi", "uvicorn"]
-nlp = [
-    "spacy",
-    "nltk",
-    "levenshtein",
-    "sentence-transformers",
-    "openai",
-    "torch",
-    "transformers",
-]
+nlp = ["levenshtein", "nltk", "openai", "spacy"]
+transformers = ["sentence-transformers", "torch", "transformers"]
 [tool.pytest.ini_options]
 addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"