PyPI - cbrkit - Versions diffs - 0.26.2.tar.gz → 0.26.4.tar.gz - Mend

cbrkit 0.26.2.tar.gz → 0.26.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{cbrkit-0.26.2 → cbrkit-0.26.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: cbrkit
-Version: 0.26.2
+Version: 0.26.4
 Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
 Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
 Author: Mirko Lenz

{cbrkit-0.26.2 → cbrkit-0.26.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "cbrkit"
-version = "0.26.2"
+version = "0.26.4"
 description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI"
 authors = [{ name = "Mirko Lenz", email = "mirko@mirkolenz.com" }]
 readme = "README.md"

{cbrkit-0.26.2 → cbrkit-0.26.4}/src/cbrkit/retrieval/build.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import itertools
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from multiprocessing.pool import Pool
 from typing import Literal, override
@@ -129,14 +129,18 @@ class combine[K, V, S: Float](RetrieverFunc[K, V, float]):
         retriever_funcs: A list of retriever functions to be combined.
         aggregator: A function to aggregate the results from the retriever functions.
         strategy: The strategy to combine the results. Either "intersection" or "union".
+        default_sim: The default similarity value to use for strategy "union" when a case is not found in one of the retriever results.
     Returns:
         A retriever function that combines the results from multiple retrievers.
     """
-    retriever_funcs: Sequence[RetrieverFunc[K, V, S]]
-    aggregator: AggregatorFunc[str, S] = default_aggregator
+    retriever_funcs: (
+        Sequence[RetrieverFunc[K, V, S]] | Mapping[str, RetrieverFunc[K, V, S]]
+    )
+    aggregator: AggregatorFunc[str, S | float] = default_aggregator
     strategy: Literal["intersection", "union"] = "union"
+    default_sim: float = 0.0
     @override
     def __call__(
@@ -154,42 +158,65 @@ class combine[K, V, S: Float](RetrieverFunc[K, V, float]):
                 for batch_idx in range(len(batches))
             ]
-        # elif isinstance(self.retriever_funcs, Mapping):
-        #     results = {
-        #         func_key: retriever_func(batches)
-        #         for func_key, retriever_func in self.retriever_funcs.items()
-        #     }
+        elif isinstance(self.retriever_funcs, Mapping):
+            results = {
+                func_key: retriever_func(batches)
+                for func_key, retriever_func in self.retriever_funcs.items()
+            }
-        #     return [
-        #         self.__call_batch__(
-        #             {func_key: func_results[batch_idx] for func_key, func_results in results.items()}
-        #         )
-        #         for batch_idx in range(len(batches))
-        #     ]
+            return [
+                self.__call_batch__(
+                    {
+                        func_key: func_results[batch_idx]
+                        for func_key, func_results in results.items()
+                    }
+                )
+                for batch_idx in range(len(batches))
+            ]
         raise ValueError(f"Invalid retriever_funcs type: {type(self.retriever_funcs)}")
-    def __call_batch__(self, results: Sequence[SimMap[K, S]]) -> SimMap[K, float]:
-        if self.strategy == "intersection":
+    def __call_batch__(
+        self, results: Sequence[SimMap[K, S]] | Mapping[str, SimMap[K, S]]
+    ) -> SimMap[K, float]:
+        case_keys: set[K]
+        if isinstance(results, Sequence):
+            if self.strategy == "intersection":
+                case_keys = set().intersection(*(result.keys() for result in results))
+            elif self.strategy == "union":
+                case_keys = set().union(*(result.keys() for result in results))
+            else:
+                raise ValueError(f"Unknown strategy: {self.strategy}")
             return {
                 case_key: self.aggregator(
-                    [result[case_key] for result in results if case_key in result]
-                )
-                for case_key in set().intersection(
-                    *[set(result.keys()) for result in results]
+                    [result.get(case_key, self.default_sim) for result in results]
                 )
+                for case_key in case_keys
             }
-        elif self.strategy == "union":
+        elif isinstance(results, Mapping):
+            if self.strategy == "intersection":
+                case_keys = set().intersection(
+                    *(result.keys() for result in results.values())
+                )
+            elif self.strategy == "union":
+                case_keys = set().union(*(result.keys() for result in results.values()))
+            else:
+                raise ValueError(f"Unknown strategy: {self.strategy}")
             return {
                 case_key: self.aggregator(
-                    [result[case_key] for result in results if case_key in result]
+                    {
+                        func_key: result.get(case_key, self.default_sim)
+                        for func_key, result in results.items()
+                    }
                 )
-                for result in results
-                for case_key in result.keys()
+                for case_key in case_keys
             }
-        raise ValueError(f"Unknown strategy: {self.strategy}")
+        raise ValueError(f"Invalid results type: {type(results)}")
 @dataclass(slots=True, frozen=True)

{cbrkit-0.26.2 → cbrkit-0.26.4}/src/cbrkit/retrieval/rerank.py RENAMED Viewed

@@ -311,12 +311,15 @@ with optional_dependencies():
                 k=len(casebase),
             )
             max_score = np.max(scores)
+            min_score = np.min(scores)
             key_index = {idx: key for idx, key in enumerate(casebase)}
             return [
                 {
-                    key_index[case_id]: float(score / max_score)
+                    key_index[case_id]: float(
+                        (score - min_score) / (max_score - min_score)
+                    )
                     for case_id, score in zip(
                         results[query_id], scores[query_id], strict=True
                     )

{cbrkit-0.26.2 → cbrkit-0.26.4}/src/cbrkit/sim/graphs/astar.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import heapq
-from collections.abc import Mapping
+from collections.abc import Collection, Mapping
 from dataclasses import dataclass, field
-from typing import Protocol
+from typing import Any, Protocol, cast
 from ...helpers import (
     get_logger,
@@ -34,6 +34,33 @@ __all__ = [
 logger = get_logger(__name__)
+def next_elem[K](elements: Collection[K]) -> K:
+    """Select the next element from a set deterministically.
+    If elements are sortable, returns the smallest one.
+    Otherwise, returns the first element from iteration.
+    Args:
+        elements: Set of elements to choose from
+    Returns:
+        A single element from the set
+    Raises:
+        ValueError: If the set is empty
+    """
+    if not elements:
+        raise ValueError("Cannot select from empty set")
+    if len(elements) == 1:
+        return next(iter(elements))
+    try:
+        return min(cast(Collection[Any], elements))
+    except TypeError:
+        return next(iter(elements))
 @dataclass(slots=True, frozen=True, order=True)
 class PriorityState[K]:
     priority: float
@@ -60,7 +87,6 @@ class SelectionFunc[K, N, E, G](Protocol):
         s: SearchState[K],
         node_pair_sims: Mapping[tuple[K, K], float],
         edge_pair_sims: Mapping[tuple[K, K], float],
-        heuristic_func: HeuristicFunc[K, N, E, G],
         /,
     ) -> None | tuple[K, GraphElementType]: ...
@@ -127,7 +153,7 @@ class h3[K, N, E, G](HeuristicFunc[K, N, E, G]):
                 default=0.0,
             )
-        def mapping_possible(x_node: Node[K, N], y_node: Node[K, N]) -> bool:
+        def can_map(x_node: Node[K, N], y_node: Node[K, N]) -> bool:
             return x_node.key == s.node_mapping.get(y_node.key) or (
                 y_node.key in s.open_y_nodes and x_node.key in s.open_x_nodes
             )
@@ -137,8 +163,8 @@ class h3[K, N, E, G](HeuristicFunc[K, N, E, G]):
                 (
                     edge_pair_sims.get((y_key, x_key), 0.0)
                     for x_key in s.open_x_edges
-                    if mapping_possible(x.edges[x_key].source, y.edges[y_key].source)
-                    and mapping_possible(x.edges[x_key].target, y.edges[y_key].target)
+                    if can_map(x.edges[x_key].source, y.edges[y_key].source)
+                    and can_map(x.edges[x_key].target, y.edges[y_key].target)
                 ),
                 default=0.0,
             )
@@ -155,19 +181,14 @@ class select1[K, N, E, G](SelectionFunc[K, N, E, G]):
         s: SearchState[K],
         node_pair_sims: Mapping[tuple[K, K], float],
         edge_pair_sims: Mapping[tuple[K, K], float],
-        heuristic_func: HeuristicFunc[K, N, E, G],
     ) -> None | tuple[K, GraphElementType]:
         """Select the next node or edge to be mapped"""
-        try:
-            return next(iter(s.open_y_nodes)), "node"
-        except StopIteration:
-            pass
+        if s.open_y_nodes:
+            return next_elem(s.open_y_nodes), "node"
-        try:
-            return next(iter(s.open_y_edges)), "edge"
-        except StopIteration:
-            pass
+        if s.open_y_edges:
+            return next_elem(s.open_y_edges), "edge"
         return None
@@ -181,24 +202,21 @@ class select2[K, N, E, G](SelectionFunc[K, N, E, G]):
         s: SearchState[K],
         node_pair_sims: Mapping[tuple[K, K], float],
         edge_pair_sims: Mapping[tuple[K, K], float],
-        heuristic_func: HeuristicFunc[K, N, E, G],
     ) -> None | tuple[K, GraphElementType]:
         """Select the next node or edge to be mapped"""
-        try:
-            return next(
-                key
-                for key in s.open_y_edges
-                if y.edges[key].source.key not in s.open_y_nodes
-                and y.edges[key].target.key not in s.open_y_nodes
-            ), "edge"
-        except StopIteration:
-            pass
+        edge_candidates = {
+            key
+            for key in s.open_y_edges
+            if y.edges[key].source.key not in s.open_y_nodes
+            and y.edges[key].target.key not in s.open_y_nodes
+        }
-        try:
-            return next(iter(s.open_y_nodes)), "node"
-        except StopIteration:
-            pass
+        if edge_candidates:
+            return next_elem(edge_candidates), "edge"
+        if s.open_y_nodes:
+            return next_elem(s.open_y_nodes), "node"
         return None
@@ -212,36 +230,75 @@ class select3[K, N, E, G](SelectionFunc[K, N, E, G]):
         s: SearchState[K],
         node_pair_sims: Mapping[tuple[K, K], float],
         edge_pair_sims: Mapping[tuple[K, K], float],
-        heuristic_func: HeuristicFunc[K, N, E, G],
     ) -> None | tuple[K, GraphElementType]:
         """Select the next node or edge to be mapped"""
-        heuristic_scores: list[tuple[K, GraphElementType, float]] = []
+        mapping_options: dict[tuple[K, GraphElementType], int] = {}
+        heuristic_scores: dict[tuple[K, GraphElementType], float] = {}
         for y_key in s.open_y_nodes:
-            heuristic_scores.append(
-                (
-                    y_key,
-                    "node",
-                    heuristic_func(x, y, s, node_pair_sims, edge_pair_sims),
-                )
+            h_vals = [
+                node_pair_sims[(y_key, x_key)]
+                for x_key in s.open_x_nodes
+                if (y_key, x_key) in node_pair_sims
+            ]
+            if h_vals:
+                mapping_options[(y_key, "node")] = len(h_vals)
+                heuristic_scores[(y_key, "node")] = max(h_vals)
+        def can_map(x_node: Node[K, N], y_node: Node[K, N]) -> bool:
+            return x_node.key == s.node_mapping.get(y_node.key) or (
+                y_node.key in s.open_y_nodes and x_node.key in s.open_x_nodes
             )
         for y_key in s.open_y_edges:
-            heuristic_scores.append(
-                (
-                    y_key,
-                    "edge",
-                    heuristic_func(x, y, s, node_pair_sims, edge_pair_sims),
-                )
-            )
+            h_vals = [
+                edge_pair_sims[(y_key, x_key)]
+                for x_key in s.open_x_edges
+                if (y_key, x_key) in edge_pair_sims
+                and can_map(x.edges[x_key].source, y.edges[y_key].source)
+                and can_map(x.edges[x_key].target, y.edges[y_key].target)
+            ]
+            if h_vals:
+                mapping_options[(y_key, "edge")] = len(h_vals)
+                heuristic_scores[(y_key, "edge")] = max(h_vals)
         if not heuristic_scores:
+            # Fallback: select any remaining node or edge for null mapping
+            # Use sorted to ensure deterministic selection
+            if s.open_y_nodes:
+                return next_elem(s.open_y_nodes), "node"
+            elif s.open_y_edges:
+                return next_elem(s.open_y_edges), "edge"
             return None
-        best_selection = max(heuristic_scores, key=lambda x: x[2])
+        # Find the maximum heuristic score
+        max_score = max(heuristic_scores.values())
+        best_selections = {
+            key for key, value in heuristic_scores.items() if value == max_score
+        }
-        selection_key, selection_type, _ = best_selection
+        # if multiple selections have the same score, select the one with the lowest number of possible mappings
+        if len(best_selections) > 1:
+            min_mapping_options = min(mapping_options[key] for key in best_selections)
+            best_selections = {
+                key
+                for key in best_selections
+                if mapping_options[key] == min_mapping_options
+            }
+        # select the one with the lowest key
+        try:
+            best_selection = min(
+                best_selections,
+                key=lambda item: cast(Any, item[0]),
+            )
+        except TypeError:
+            best_selection = next(iter(best_selections))
+        selection_key, selection_type = best_selection
         if selection_type == "edge":
             edge = y.edges[selection_key]
@@ -271,7 +328,7 @@ class build[K, N, E, G](
         beam_width: Limits the queue size which prunes the search space.
             This leads to a faster search and less memory usage but also introduces a similarity error.
             Disabled by default. Based on [Neuhaus et al. (2006)](https://doi.org/10.1007/11815921_17).
-        pathlength_weight: Add a penalty for states with few mapped elements that already have a low similarity.
+        pathlength_weight: Favor long partial edit paths over shorter ones.
             Disabled by default. Based on [Neuhaus et al. (2006)](https://doi.org/10.1007/11815921_17).
     Returns:
@@ -300,7 +357,6 @@ class build[K, N, E, G](
             state,
             node_pair_sims,
             edge_pair_sims,
-            self.heuristic_func,
         )
         if selection is None:
@@ -338,22 +394,11 @@ class build[K, N, E, G](
         prio = 1 - (past_sim + future_sim)
         if self.pathlength_weight > 0:
-            node_null_mapping = (
-                set(y.nodes.keys())
-                - set(state.node_mapping.keys())
-                - set(state.open_y_nodes)
-            )
-            edge_null_mapping = (
-                set(y.edges.keys())
-                - set(state.edge_mapping.keys())
-                - set(state.open_y_edges)
-            )
-            num_paths = (
-                len(state.node_mapping)
-                + len(state.edge_mapping)
-                + len(node_null_mapping)
-                + len(edge_null_mapping)
-            )
+            # Calculate the number of mapping decisions made so far (partial edit path length)
+            # This includes actual mappings plus null mappings (elements processed but not mapped)
+            total_y_elements = len(y.nodes) + len(y.edges)
+            open_y_elements = len(state.open_y_nodes) + len(state.open_y_edges)
+            num_paths = total_y_elements - open_y_elements
             return prio / (self.pathlength_weight**num_paths)
         return prio
@@ -367,11 +412,33 @@ class build[K, N, E, G](
         open_set: list[PriorityState[K]] = []
         best_state = self.init_search_state(x, y)
+        # best_similarity = self.similarity(
+        #     x,
+        #     y,
+        #     best_state.node_mapping,
+        #     best_state.edge_mapping,
+        #     node_pair_sims,
+        #     edge_pair_sims,
+        # )
         heapq.heappush(open_set, PriorityState(0, best_state))
         while open_set:
             first_elem = heapq.heappop(open_set)
             current_state = first_elem.state
+            # current_similarity = self.similarity(
+            #     x,
+            #     y,
+            #     current_state.node_mapping,
+            #     current_state.edge_mapping,
+            #     node_pair_sims,
+            #     edge_pair_sims,
+            # )
+            # not needed because we add null mappings and
+            # the first item of the queue is always the best one
+            # if current_similarity.value > best_similarity.value:
+            #     best_state = current_state
+            #     best_similarity = current_similarity
             if self.finished(current_state):
                 best_state = current_state
@@ -392,7 +459,8 @@ class build[K, N, E, G](
                 heapq.heappush(open_set, PriorityState(next_prio, next_state))
             if self.beam_width > 0 and len(open_set) > self.beam_width:
-                open_set = open_set[: self.beam_width]
+                open_set = heapq.nsmallest(self.beam_width, open_set)
+                heapq.heapify(open_set)
         return self.similarity(
             x,

{cbrkit-0.26.2 → cbrkit-0.26.4}/src/cbrkit/sim/graphs/brute_force.py RENAMED Viewed

@@ -74,8 +74,16 @@ class brute_force[K, N, E, G](
                     if next_sim and (
                         next_sim.value > best_sim.value
-                        or len(next_sim.node_mapping) > len(best_sim.node_mapping)
-                        or len(next_sim.edge_mapping) > len(best_sim.edge_mapping)
+                        or (
+                            next_sim.value >= best_sim.value
+                            and (
+                                len(next_sim.node_mapping) > len(best_sim.node_mapping)
+                                or (
+                                    len(next_sim.edge_mapping)
+                                    > len(best_sim.edge_mapping)
+                                )
+                            )
+                        )
                     ):
                         best_sim = next_sim

{cbrkit-0.26.2 → cbrkit-0.26.4}/src/cbrkit/sim/graphs/common.py RENAMED Viewed

@@ -287,16 +287,18 @@ class init_unique_matches[K, N, E, G](SearchStateInit[K, N, E, G]):
         edge_matcher: ElementMatcher[E],
     ) -> SearchState[K]:
         # pre-populate the mapping with nodes/edges that only have one possible legal mapping
-        possible_node_mappings: defaultdict[K, set[K]] = defaultdict(set)
+        y2x_map: defaultdict[K, set[K]] = defaultdict(set)
+        x2y_map: defaultdict[K, set[K]] = defaultdict(set)
         for y_key, x_key in itertools.product(y.nodes.keys(), x.nodes.keys()):
             if node_matcher(x.nodes[x_key].value, y.nodes[y_key].value):
-                possible_node_mappings[y_key].add(x_key)
+                y2x_map[y_key].add(x_key)
+                x2y_map[x_key].add(y_key)
         node_mapping: frozendict[K, K] = frozendict(
             (y_key, next(iter(x_keys)))
-            for y_key, x_keys in possible_node_mappings.items()
-            if len(x_keys) == 1
+            for y_key, x_keys in y2x_map.items()
+            if len(x_keys) == 1 and len(x2y_map[next(iter(x_keys))]) == 1
         )
         edge_mapping: frozendict[K, K] = _induced_edge_mapping(