cbrkit 0.12.2__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cbrkit
3
- Version: 0.12.2
3
+ Version: 0.13.0
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
5
5
  Home-page: https://wi2trier.github.io/cbrkit/
6
6
  License: MIT
@@ -299,9 +299,9 @@ result = cbrkit.retrieval.apply(casebase, query, (retriever1, retriever2))
299
299
  The result has the following two attributes:
300
300
 
301
301
  - `final`: Result of the last retriever in the list.
302
- - `intermediates`: A list of results for each retriever in the list.
302
+ - `steps`: A list of results for each retriever in the list.
303
303
 
304
- Both `final` and each entry in `intermediates` have the same attributes as discussed previously.
304
+ Both `final` and each entry in `steps` have the same attributes as discussed previously.
305
305
  The returned result also exposes these attributes directly as aliases for the corresponding attributes of `final` (i.e., `result.ranking == result.final.ranking`).
306
306
 
307
307
  ## REST API and CLI
@@ -244,9 +244,9 @@ result = cbrkit.retrieval.apply(casebase, query, (retriever1, retriever2))
244
244
  The result has the following two attributes:
245
245
 
246
246
  - `final`: Result of the last retriever in the list.
247
- - `intermediates`: A list of results for each retriever in the list.
247
+ - `steps`: A list of results for each retriever in the list.
248
248
 
249
- Both `final` and each entry in `intermediates` have the same attributes as discussed previously.
249
+ Both `final` and each entry in `steps` have the same attributes as discussed previously.
250
250
  The returned result also exposes these attributes directly as aliases for the corresponding attributes of `final` (i.e., `result.ranking == result.final.ranking`).
251
251
 
252
252
  ## REST API and CLI
@@ -1,5 +1,8 @@
1
1
  from typing import Any
2
2
 
3
+ from pydantic import ConfigDict
4
+ from pydantic.dataclasses import dataclass
5
+
3
6
  try:
4
7
  from fastapi import FastAPI
5
8
  from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -9,6 +12,10 @@ except ModuleNotFoundError:
9
12
 
10
13
  import cbrkit
11
14
 
15
+ ApiResult = dataclass(
16
+ cbrkit.retrieval.Result, config=ConfigDict(arbitrary_types_allowed=True)
17
+ )
18
+
12
19
 
13
20
  class Settings(BaseSettings):
14
21
  model_config = SettingsConfigDict(env_prefix="cbrkit_")
@@ -33,7 +40,7 @@ elif settings.retriever_map is not None:
33
40
  retriever = list(retriever_map.values())
34
41
 
35
42
 
36
- @app.post("/retrieve", response_model=None)
43
+ @app.post("/retrieve", response_model=dict[str, ApiResult])
37
44
  def all_retrievers(
38
45
  casebase: dict[str, Any], queries: dict[str, Any]
39
46
  ) -> dict[str, cbrkit.retrieval.Result]:
@@ -43,7 +50,7 @@ def all_retrievers(
43
50
  }
44
51
 
45
52
 
46
- @app.post("/retrieve/{retriever_name}", response_model=None)
53
+ @app.post("/retrieve/{retriever_name}", response_model=dict[str, ApiResult])
47
54
  def named_retriever(
48
55
  retriever_name: str, casebase: dict[str, Any], queries: dict[str, Any]
49
56
  ) -> dict[str, cbrkit.retrieval.Result]:
@@ -2,6 +2,7 @@
2
2
  .. include:: ../cli.md
3
3
  """
4
4
 
5
+ import json
5
6
  import os
6
7
  import sys
7
8
  from pathlib import Path
@@ -26,21 +27,34 @@ def app_callback():
26
27
  pass
27
28
 
28
29
 
30
+ # py -m cbrkit retrieve data/cars-1k.csv data/cars-queries.csv examples.cars_retriever:retriever --output-path data/output.json
29
31
  @app.command()
30
32
  def retrieve(
31
33
  casebase_path: Path,
32
34
  queries_path: Path,
33
35
  retriever: str,
36
+ search_path: Annotated[list[Path], typer.Option(default_factory=list)],
34
37
  print_ranking: bool = True,
38
+ output_path: Path | None = None,
39
+ processes: int = 1,
35
40
  ) -> None:
41
+ sys.path.extend(str(x) for x in search_path)
36
42
  casebase = cbrkit.loaders.path(casebase_path)
37
43
  queries = cbrkit.loaders.path(queries_path)
38
44
  retrievers = cbrkit.retrieval.load(retriever)
39
45
 
40
- for query_name, query in queries.items():
41
- result = cbrkit.retrieval.apply(casebase, query, retrievers)
46
+ results = cbrkit.retrieval.mapply(casebase, queries, retrievers, processes)
47
+
48
+ if output_path:
49
+ results_dict = {
50
+ query_name: result.as_dict() for query_name, result in results.items()
51
+ }
52
+
53
+ with output_path.with_suffix(".json").open("w") as fp:
54
+ json.dump(results_dict, fp, indent=2)
42
55
 
43
- if print_ranking:
56
+ if print_ranking:
57
+ for query_name, result in results.items():
44
58
  print(f"Query: {query_name}")
45
59
  print(result.ranking)
46
60
  print()
@@ -1,4 +1,3 @@
1
- from abc import ABC
2
1
  from collections.abc import Collection, Iterable, Mapping, Sequence
3
2
  from inspect import signature as inspect_signature
4
3
  from typing import Any, cast
@@ -21,7 +20,6 @@ __all__ = [
21
20
  "sim2map",
22
21
  "unpack_sim",
23
22
  "unpack_sims",
24
- "AbstractFloat",
25
23
  "singleton",
26
24
  ]
27
25
 
@@ -156,8 +154,3 @@ def unpack_sim(sim: AnyFloat) -> float:
156
154
 
157
155
  def unpack_sims(sims: Iterable[AnyFloat]) -> list[float]:
158
156
  return [unpack_sim(sim) for sim in sims]
159
-
160
-
161
- class AbstractFloat(ABC, float):
162
- def __new__(cls, *args, **kwargs):
163
- return float.__new__(cls, args[0])
@@ -4,7 +4,6 @@ This module provides several loaders to read data from different file formats an
4
4
 
5
5
  import csv as csvlib
6
6
  import tomllib
7
- from collections import abc
8
7
  from collections.abc import Callable, Iterator, Mapping
9
8
  from importlib import import_module
10
9
  from pathlib import Path
@@ -58,7 +57,7 @@ def python(import_name: str) -> Any:
58
57
  return getattr(module, obj_name)
59
58
 
60
59
 
61
- class DataFrameCasebase(abc.Mapping):
60
+ class DataFrameCasebase(Mapping):
62
61
  __slots__ = ("df",)
63
62
 
64
63
  df: DataFrame
@@ -1,5 +1,6 @@
1
+ import multiprocessing as mp
1
2
  from collections.abc import Callable, Collection, Mapping, Sequence
2
- from dataclasses import dataclass
3
+ from dataclasses import asdict, dataclass
3
4
  from typing import Any, Generic
4
5
 
5
6
  from cbrkit.helpers import sim2map, unpack_sim
@@ -16,11 +17,12 @@ from cbrkit.typing import (
16
17
 
17
18
  __all__ = [
18
19
  "build",
20
+ "mapply",
19
21
  "apply",
20
22
  "load",
21
23
  "load_map",
22
24
  "Result",
23
- "SingleResult",
25
+ "ResultStep",
24
26
  ]
25
27
 
26
28
 
@@ -31,7 +33,7 @@ def _similarities2ranking(
31
33
 
32
34
 
33
35
  @dataclass(slots=True)
34
- class SingleResult(Generic[KeyType, ValueType, SimType]):
36
+ class ResultStep(Generic[KeyType, ValueType, SimType]):
35
37
  similarities: SimMap[KeyType, SimType]
36
38
  ranking: list[KeyType]
37
39
  casebase: Casebase[KeyType, ValueType]
@@ -41,24 +43,32 @@ class SingleResult(Generic[KeyType, ValueType, SimType]):
41
43
  cls,
42
44
  similarities: SimMap[KeyType, SimType],
43
45
  full_casebase: Casebase[KeyType, ValueType],
44
- ) -> "SingleResult[KeyType, ValueType, SimType]":
46
+ ) -> "ResultStep[KeyType, ValueType, SimType]":
45
47
  ranking = _similarities2ranking(similarities)
46
48
  casebase = {key: full_casebase[key] for key in ranking}
47
49
 
48
50
  return cls(similarities=similarities, ranking=ranking, casebase=casebase)
49
51
 
52
+ def as_dict(self) -> dict[str, Any]:
53
+ x = asdict(self)
54
+ del x["casebase"]
55
+
56
+ return x
57
+
50
58
 
51
59
  @dataclass(slots=True)
52
60
  class Result(Generic[KeyType, ValueType, SimType]):
53
- final: SingleResult[KeyType, ValueType, SimType]
54
- intermediates: list[SingleResult[KeyType, ValueType, SimType]]
61
+ steps: list[ResultStep[KeyType, ValueType, SimType]]
55
62
 
56
63
  def __init__(
57
64
  self,
58
- results: list[SingleResult[KeyType, ValueType, SimType]],
65
+ steps: list[ResultStep[KeyType, ValueType, SimType]],
59
66
  ) -> None:
60
- self.final = results[-1]
61
- self.intermediates = results
67
+ self.steps = steps
68
+
69
+ @property
70
+ def final(self) -> ResultStep[KeyType, ValueType, SimType]:
71
+ return self.steps[-1]
62
72
 
63
73
  @property
64
74
  def similarities(self) -> SimMap[KeyType, SimType]:
@@ -72,6 +82,51 @@ class Result(Generic[KeyType, ValueType, SimType]):
72
82
  def casebase(self) -> Casebase[KeyType, ValueType]:
73
83
  return self.final.casebase
74
84
 
85
+ def as_dict(self) -> dict[str, Any]:
86
+ x = asdict(self)
87
+
88
+ for entry in x["steps"]:
89
+ del entry["casebase"]
90
+
91
+ return x
92
+
93
+
94
+ def mapply(
95
+ casebase: Casebase[KeyType, ValueType],
96
+ queries: Mapping[KeyType, ValueType],
97
+ retrievers: SimMapFunc[KeyType, ValueType, SimType]
98
+ | Sequence[SimMapFunc[KeyType, ValueType, SimType]],
99
+ processes: int = 1,
100
+ ) -> Mapping[KeyType, Result[KeyType, ValueType, SimType]]:
101
+ """Applies multiple queries to a Casebase using retriever functions.
102
+
103
+ Args:
104
+ casebase: The casebase for the query.
105
+ queries: The queries that will be applied to the casebase.
106
+ retrievers: Retriever functions that will retrieve similar cases (compared to the query) from the casebase.
107
+ processes: Number of CPUs that will be used for multiprocessing.
108
+ If 1, a regular loop will be used.
109
+ If 0, the number of processes will be equal to the number of CPUs.
110
+ Negative values will be treated as 0.
111
+
112
+ Returns:
113
+ Returns an object of type Result.
114
+ """
115
+
116
+ if processes != 1:
117
+ pool_processes = None if processes <= 0 else processes
118
+
119
+ with mp.Pool(pool_processes) as pool:
120
+ return {
121
+ key: pool.apply(
122
+ apply,
123
+ args=(casebase, value, retrievers),
124
+ )
125
+ for key, value in queries.items()
126
+ }
127
+
128
+ return {key: apply(casebase, value, retrievers) for key, value in queries.items()}
129
+
75
130
 
76
131
  def apply(
77
132
  casebase: Casebase[KeyType, ValueType],
@@ -79,7 +134,7 @@ def apply(
79
134
  retrievers: SimMapFunc[KeyType, ValueType, SimType]
80
135
  | Sequence[SimMapFunc[KeyType, ValueType, SimType]],
81
136
  ) -> Result[KeyType, ValueType, SimType]:
82
- """Applies a query to a Casebase using retriever functions.
137
+ """Applies a single query to a Casebase using retriever functions.
83
138
 
84
139
  Args:
85
140
  casebase: The casebase for the query.
@@ -117,12 +172,12 @@ def apply(
117
172
  retrievers = [retrievers]
118
173
 
119
174
  assert len(retrievers) > 0
120
- results: list[SingleResult[KeyType, ValueType, SimType]] = []
175
+ results: list[ResultStep[KeyType, ValueType, SimType]] = []
121
176
  current_casebase = casebase
122
177
 
123
178
  for retriever_func in retrievers:
124
179
  sim_map = retriever_func(current_casebase, query)
125
- result = SingleResult.build(sim_map, current_casebase)
180
+ result = ResultStep.build(sim_map, current_casebase)
126
181
 
127
182
  results.append(result)
128
183
  current_casebase = result.casebase
@@ -63,6 +63,12 @@ def aggregator(
63
63
  >>> agg = aggregator("mean")
64
64
  >>> agg([0.5, 0.75, 1.0])
65
65
  0.75
66
+ >>> agg = aggregator("mean", {1: 1, 2: 1, 3: 0})
67
+ >>> agg({1: 1, 2: 1, 3: 1})
68
+ 1.0
69
+ >>> agg = aggregator("mean", {1: 1, 2: 1, 3: 2})
70
+ >>> agg({1: 1, 2: 1, 3: 1})
71
+ 1.0
66
72
  """
67
73
 
68
74
  pooling_func = _pooling_funcs[pooling] if isinstance(pooling, str) else pooling
@@ -70,6 +76,7 @@ def aggregator(
70
76
  def wrapped_func(similarities: SimSeqOrMap[KeyType, AnyFloat]) -> float:
71
77
  assert pooling_weights is None or type(similarities) is type(pooling_weights)
72
78
 
79
+ pooling_factor = 1.0
73
80
  sims: Sequence[float] # noqa: F821
74
81
 
75
82
  if isinstance(similarities, Mapping) and isinstance(pooling_weights, Mapping):
@@ -77,6 +84,10 @@ def aggregator(
77
84
  unpack_sim(sim) * pooling_weights.get(key, default_pooling_weight)
78
85
  for key, sim in similarities.items()
79
86
  ]
87
+ pooling_factor = len(similarities) / sum(
88
+ pooling_weights.get(key, default_pooling_weight)
89
+ for key in similarities.keys()
90
+ )
80
91
  elif isinstance(similarities, Sequence) and isinstance(
81
92
  pooling_weights, Sequence
82
93
  ):
@@ -84,6 +95,7 @@ def aggregator(
84
95
  unpack_sim(s) * w
85
96
  for s, w in zip(similarities, pooling_weights, strict=True)
86
97
  ]
98
+ pooling_factor = len(similarities) / sum(pooling_weights)
87
99
  elif isinstance(similarities, Sequence) and pooling_weights is None:
88
100
  sims = [unpack_sim(s) for s in similarities]
89
101
  elif isinstance(similarities, Mapping) and pooling_weights is None:
@@ -91,6 +103,6 @@ def aggregator(
91
103
  else:
92
104
  raise NotImplementedError()
93
105
 
94
- return pooling_func(sims)
106
+ return pooling_func(sims) * pooling_factor
95
107
 
96
108
  return wrapped_func
@@ -8,7 +8,17 @@ from cbrkit.typing import FloatProtocol, SimPairFunc, SimType, ValueType
8
8
 
9
9
  Number = float | int
10
10
 
11
- __all__ = ["jaccard", "smith_waterman", "dtw"]
11
+ __all__ = [
12
+ "jaccard",
13
+ "smith_waterman",
14
+ "dtw",
15
+ "isolated_mapping",
16
+ "mapping",
17
+ "sequence_mapping",
18
+ "sequence_correctness",
19
+ "SequenceSim",
20
+ "Weight",
21
+ ]
12
22
 
13
23
 
14
24
  def jaccard() -> SimPairFunc[Collection[Any], float]:
@@ -1,6 +1,6 @@
1
1
  from collections.abc import Hashable
2
2
  from dataclasses import dataclass
3
- from typing import Generic, Protocol, TypeVar
3
+ from typing import Generic, Protocol, TypeVar, runtime_checkable
4
4
 
5
5
  NodeKey = TypeVar("NodeKey")
6
6
  NodeData = TypeVar("NodeData")
@@ -9,12 +9,14 @@ EdgeData = TypeVar("EdgeData")
9
9
  GraphData = TypeVar("GraphData")
10
10
 
11
11
 
12
+ @runtime_checkable
12
13
  class EdgeProtocol(Hashable, Protocol[EdgeData, NodeKey]):
13
14
  source: NodeKey
14
15
  target: NodeKey
15
16
  data: EdgeData
16
17
 
17
18
 
19
+ @runtime_checkable
18
20
  class NodeProtocol(Hashable, Protocol[NodeData]):
19
21
  data: NodeData
20
22
 
@@ -4,15 +4,37 @@ from cbrkit.typing import SimPairFunc
4
4
 
5
5
  Number = float | int
6
6
 
7
- __all__ = ["linear", "threshold", "exponential", "sigmoid"]
7
+ __all__ = ["linear_interval", "linear", "threshold", "exponential", "sigmoid"]
8
+
9
+
10
+ def linear_interval(min: float, max: float) -> SimPairFunc[Number, float]:
11
+ """Linear similarity function based on the distance between two values within a range.
12
+
13
+ Args:
14
+ min: Lower bound of the interval. Should be the minimal value of the entire case base.
15
+ max: Upper bound of the interval. Should be the maximal value of the entire case base.
16
+
17
+ Examples:
18
+ >>> sim = linear_interval(1950, 2000)
19
+ >>> sim(1950, 1975)
20
+ 0.5
21
+ """
22
+
23
+ def wrapped_func(x: Number, y: Number) -> float:
24
+ if x < min or x > max or y < min or y > max:
25
+ return 0.0
26
+
27
+ return 1.0 - abs(x - y) / (max - min)
28
+
29
+ return wrapped_func
8
30
 
9
31
 
10
32
  def linear(max: float, min: float = 0.0) -> SimPairFunc[Number, float]:
11
- """Linear similarity function.
33
+ """Linear similarity function based on the distance between two values.
12
34
 
13
35
  Args:
14
- max: Maximum bound of the interval
15
- min: Minimum bound of the interval
36
+ max: Maximum bound of the distance (i.e., the point where the similarity is 0.0)
37
+ min: Minimum bound of the distance (i.e., the point where the similarity is 1.0)
16
38
 
17
39
  ![linear](../../assets/numeric/linear.png)
18
40
 
@@ -1,11 +1,9 @@
1
1
  from collections.abc import Mapping, Sequence
2
2
  from pathlib import Path
3
- from typing import (
4
- Protocol,
5
- TypeVar,
6
- )
3
+ from typing import Protocol, TypeVar, runtime_checkable
7
4
 
8
5
 
6
+ @runtime_checkable
9
7
  class FloatProtocol(Protocol):
10
8
  value: float
11
9
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cbrkit"
3
- version = "0.12.2"
3
+ version = "0.13.0"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
5
5
  authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes