cbrkit 0.2.1__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {cbrkit-0.2.1 → cbrkit-0.3.1}/PKG-INFO +39 -18
  2. {cbrkit-0.2.1 → cbrkit-0.3.1}/README.md +28 -8
  3. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/_aggregate.py +14 -0
  4. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/_attribute_value.py +13 -0
  5. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/graph/_astar.py +14 -2
  6. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/graph/_model.py +1 -1
  7. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/loaders.py +157 -6
  8. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/retrieval.py +67 -2
  9. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/_helpers.py +23 -0
  10. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/collections.py +9 -0
  11. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/generic.py +27 -0
  12. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/numeric.py +20 -1
  13. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/strings.py +83 -1
  14. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/taxonomy.py +21 -1
  15. {cbrkit-0.2.1 → cbrkit-0.3.1}/pyproject.toml +22 -28
  16. {cbrkit-0.2.1 → cbrkit-0.3.1}/LICENSE +0 -0
  17. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/__init__.py +0 -0
  18. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/__main__.py +0 -0
  19. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/api.py +0 -0
  20. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/cli.py +0 -0
  21. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/__init__.py +0 -0
  22. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/global_sim/graph/__init__.py +0 -0
  23. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/py.typed +0 -0
  24. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/sim/__init__.py +0 -0
  25. {cbrkit-0.2.1 → cbrkit-0.3.1}/cbrkit/typing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cbrkit
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
5
5
  Home-page: https://wi2trier.github.io/cbrkit/
6
6
  License: MIT
@@ -28,20 +28,21 @@ Provides-Extra: all
28
28
  Provides-Extra: api
29
29
  Provides-Extra: cli
30
30
  Provides-Extra: nlp
31
- Requires-Dist: fastapi[all] (>=0.104,<0.105) ; extra == "all" or extra == "api"
32
- Requires-Dist: levenshtein (>=0.23,<0.24) ; extra == "all" or extra == "nlp"
31
+ Provides-Extra: transformers
32
+ Requires-Dist: fastapi[all] (>=0.100,<1.0) ; extra == "all" or extra == "api"
33
+ Requires-Dist: levenshtein (>=0.23,<1.0) ; extra == "all" or extra == "nlp"
33
34
  Requires-Dist: nltk (>=3.8,<4.0) ; extra == "all" or extra == "nlp"
34
- Requires-Dist: openai (>=1.3,<2.0) ; extra == "all" or extra == "nlp"
35
+ Requires-Dist: openai (>=1.5,<2.0) ; extra == "all" or extra == "nlp"
35
36
  Requires-Dist: orjson (>=3.9,<4.0)
36
37
  Requires-Dist: pandas (>=2.1,<3.0)
37
- Requires-Dist: pyarrow (>=14.0,<15.0)
38
+ Requires-Dist: pyarrow (>=13.0)
38
39
  Requires-Dist: pyyaml (>=6.0,<7.0)
39
- Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "nlp"
40
- Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "nlp"
41
- Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "nlp"
42
- Requires-Dist: transformers (>=4.36,<5.0) ; extra == "all" or extra == "nlp"
40
+ Requires-Dist: sentence-transformers (>=2.2,<3.0) ; extra == "all" or extra == "transformers"
41
+ Requires-Dist: spacy (>=3.7,<4.0) ; extra == "all" or extra == "all" or extra == "nlp"
42
+ Requires-Dist: torch (>=2.1.1,<3.0.0) ; extra == "all" or extra == "transformers"
43
+ Requires-Dist: transformers (>=4.35,<5.0) ; extra == "all" or extra == "transformers"
43
44
  Requires-Dist: typer[all] (>=0.9,<0.10) ; extra == "all" or extra == "cli"
44
- Requires-Dist: uvicorn[standard] (>=0.24,<0.25) ; extra == "all" or extra == "api"
45
+ Requires-Dist: uvicorn[standard] (>=0.24,<1.0) ; extra == "all" or extra == "api"
45
46
  Requires-Dist: xmltodict (>=0.13,<0.14)
46
47
  Project-URL: Repository, https://github.com/wi2trier/cbrkit
47
48
  Description-Content-Type: text/markdown
@@ -64,11 +65,6 @@ Description-Content-Type: text/markdown
64
65
 
65
66
  # CBRkit
66
67
 
67
- > [!caution]
68
- > The project is under active development and does not yet adhere to semantic versioning.
69
- > Breaking changes may occur at any time for versions `0.x.y`.
70
- > Once the project reaches version `1.0`, semantic versioning will be applied.
71
-
72
68
  ## Installation
73
69
 
74
70
  The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
@@ -85,7 +81,8 @@ pip install cbrkit[EXTRA_NAME,...]
85
81
 
86
82
  where `EXTRA_NAME` is one of the following:
87
83
 
88
- - `nlp`: Natural Language Processing (NLP), including `spacy`, `openai`, and `sentence-transformers`
84
+ - `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
85
+ - `transformers`: NLP tools based on `pytorch` and `transformers`
89
86
  - `cli`: Command Line Interface (CLI)
90
87
  - `api`: REST API Server
91
88
  - `all`: All of the above
@@ -95,12 +92,36 @@ where `EXTRA_NAME` is one of the following:
95
92
  CBRkit allows the definition of similarity metrics through _composition_.
96
93
  This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
97
94
  CBRkit also includes predefined aggregation functions.
98
- A working retrieval example can be found as part of our [testing suite](https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py).
99
-
95
+ To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-demo) that shows how to use the library in a real-world scenario.
100
96
  The following modules are part of CBRkit:
101
97
 
98
+ - `loaders`: Functions for loading cases and queries.
102
99
  - `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
103
100
  - `global_sim`: Similarity generator functions for aggregating the above ones.
104
101
  - `retrieval`: Functions for retrieving cases based on a query.
105
102
  - `typing`: Generic type definitions for defining custom functions.
106
103
 
104
+ CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
105
+ We will explain all modules and their basic usage in the following sections.
106
+
107
+ ### Loading Cases
108
+
109
+ The first step is to load cases and queries.
110
+ We provide predefined functions for the most common formats like CSV, JSON, and XML.
111
+ Additionally, `cbrkit` also integrates with `pandas` for loading data frames.
112
+ The following example shows how to load cases and queries from a CSV file using `pandas`:
113
+
114
+ ```python
115
+ import pandas as pd
116
+ import cbrkit
117
+
118
+ df = pd.read_csv("path/to/cases.csv")
119
+ cases = cbrkit.loaders.dataframe(df)
120
+ ```
121
+
122
+ Queries can either be loaded using the same loader functions or constructed manually.
123
+
124
+ ```python
125
+ queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
126
+ ```
127
+
@@ -16,11 +16,6 @@
16
16
 
17
17
  # CBRkit
18
18
 
19
- > [!caution]
20
- > The project is under active development and does not yet adhere to semantic versioning.
21
- > Breaking changes may occur at any time for versions `0.x.y`.
22
- > Once the project reaches version `1.0`, semantic versioning will be applied.
23
-
24
19
  ## Installation
25
20
 
26
21
  The library is available on [PyPI](https://pypi.org/project/cbrkit/), so you can install it with `pip`:
@@ -37,7 +32,8 @@ pip install cbrkit[EXTRA_NAME,...]
37
32
 
38
33
  where `EXTRA_NAME` is one of the following:
39
34
 
40
- - `nlp`: Natural Language Processing (NLP), including `spacy`, `openai`, and `sentence-transformers`
35
+ - `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
36
+ - `transformers`: NLP tools based on `pytorch` and `transformers`
41
37
  - `cli`: Command Line Interface (CLI)
42
38
  - `api`: REST API Server
43
39
  - `all`: All of the above
@@ -47,11 +43,35 @@ where `EXTRA_NAME` is one of the following:
47
43
  CBRkit allows the definition of similarity metrics through _composition_.
48
44
  This means that you can easily build even complex similarities by mixing built-in and/or custom measures.
49
45
  CBRkit also includes predefined aggregation functions.
50
- A working retrieval example can be found as part of our [testing suite](https://github.com/wi2trier/cbrkit/tree/main/tests/test_retrieve.py).
51
-
46
+ To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-demo) that shows how to use the library in a real-world scenario.
52
47
  The following modules are part of CBRkit:
53
48
 
49
+ - `loaders`: Functions for loading cases and queries.
54
50
  - `sim`: Similarity generator functions for various data types (e.g., strings, numbers).
55
51
  - `global_sim`: Similarity generator functions for aggregating the above ones.
56
52
  - `retrieval`: Functions for retrieving cases based on a query.
57
53
  - `typing`: Generic type definitions for defining custom functions.
54
+
55
+ CBRkit is fully typed, so IDEs like VSCode and PyCharm can provide autocompletion and type checking.
56
+ We will explain all modules and their basic usage in the following sections.
57
+
58
+ ### Loading Cases
59
+
60
+ The first step is to load cases and queries.
61
+ We provide predefined functions for the most common formats like CSV, JSON, and XML.
62
+ Additionally, `cbrkit` also integrates with `pandas` for loading data frames.
63
+ The following example shows how to load cases and queries from a CSV file using `pandas`:
64
+
65
+ ```python
66
+ import pandas as pd
67
+ import cbrkit
68
+
69
+ df = pd.read_csv("path/to/cases.csv")
70
+ cases = cbrkit.loaders.dataframe(df)
71
+ ```
72
+
73
+ Queries can either be loaded using the same loader functions or constructed manually.
74
+
75
+ ```python
76
+ queries = cbrkit.loaders.dataframe(pd.read_csv("path/to/queries.csv"))
77
+ ```
@@ -51,6 +51,20 @@ def aggregator(
51
51
  pooling_weights: SimSeqOrMap[KeyType, float] | None = None,
52
52
  default_pooling_weight: float = 1.0,
53
53
  ) -> AggregatorFunc[KeyType, AnyFloat]:
54
+ """
55
+ Aggregates local similarities to a global similarity using the specified pooling function.
56
+
57
+ Args:
58
+ pooling: The pooling function to use. It can be either a string representing the name of the pooling function or a custom pooling function (see `cbrkit.typing.PoolingFunc`).
59
+ pooling_weights: The weights to apply to the similarities during pooling. It can be a sequence or a mapping. If None, every local similarity is weighted equally.
60
+ default_pooling_weight: The default weight to use if a similarity key is not found in the pooling_weights mapping.
61
+
62
+ Examples:
63
+ >>> global_sim = aggregator("mean")
64
+ >>> global_sim([0.5, 0.75, 1.0])
65
+ 0.75
66
+ """
67
+
54
68
  pooling_func = _pooling_funcs[pooling] if isinstance(pooling, str) else pooling
55
69
 
56
70
  def wrapped_func(similarities: SimSeqOrMap[KeyType, AnyFloat]) -> float:
@@ -60,6 +60,19 @@ def attribute_value(
60
60
  value_getter: Callable[[Any, str], Any] = _value_getter,
61
61
  key_getter: Callable[[Any], Iterator[str]] = _key_getter,
62
62
  ) -> SimMapFunc[Any, AttributeValueData, AttributeValueSim[SimType]]:
63
+ """
64
+ Similarity function that computes the attribute value similarity between two cases.
65
+
66
+ Args:
67
+ attributes: A mapping of attribute names to the similarity functions to be used for those attributes. Takes precedence over types.
68
+ types: A mapping of attribute types to the similarity functions to be used for those types.
69
+ types_fallback: A similarity function to be used as a fallback when no specific similarity function
70
+ is defined for an attribute type.
71
+ aggregator: A function that aggregates the local similarity scores for each attribute into a single global similarity.
72
+ value_getter: A function that retrieves the value of an attribute from a case.
73
+ key_getter: A function that retrieves the attribute names from a target case.
74
+ """
75
+
63
76
  attributes_map: Mapping[str, AnySimFunc[KeyType, Any, SimType]] = (
64
77
  {} if attributes is None else attributes
65
78
  )
@@ -22,7 +22,7 @@ from cbrkit.typing import Casebase, FloatProtocol, KeyType, SimPairFunc, SimType
22
22
  logger = logging.getLogger(__name__)
23
23
 
24
24
 
25
- @dataclass
25
+ @dataclass(slots=True)
26
26
  class GraphMapping(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
27
27
  """Store all mappings and perform integrity checks on them"""
28
28
 
@@ -107,7 +107,7 @@ class GraphMapping(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
107
107
  self.edge_mappings[x] = y
108
108
 
109
109
 
110
- @dataclass
110
+ @dataclass(slots=True)
111
111
  class SearchNode(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
112
112
  """Specific search node"""
113
113
 
@@ -149,6 +149,18 @@ def astar(
149
149
  edge_sim_func: SimPairFunc[EdgeData, SimType],
150
150
  queue_limit: int,
151
151
  ) -> dict[KeyType, GraphSim[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]]:
152
+ """
153
+ Performs the A* algorithm proposed by [Bergmann and Gil (2014)](https://doi.org/10.1016/j.is.2012.07.005) to compute the similarity between a query graph and the graphs in the casebase.
154
+
155
+ Args:
156
+ x_map: A casebase of graphs
157
+ y: Query graph
158
+ node_sim_func: A similarity function for graph nodes
159
+ edge_sim_func: A similarity function for graph edges
160
+ queue_limit: Limits the queue size which prunes the search space. This leads to a faster search and less memory usage but also introduces a similarity error.
161
+
162
+ """
163
+
152
164
  results = {
153
165
  key: _astar_single(
154
166
  x,
@@ -19,7 +19,7 @@ class NodeProtocol(Hashable, Protocol[NodeData]):
19
19
  data: NodeData
20
20
 
21
21
 
22
- @dataclass
22
+ @dataclass(slots=True)
23
23
  class Graph(Generic[GraphData, NodeKey, NodeData, EdgeKey, EdgeData]):
24
24
  nodes: dict[NodeKey, NodeProtocol[NodeData]]
25
25
  edges: dict[EdgeKey, EdgeProtocol[EdgeData, NodeKey]]
@@ -53,6 +53,8 @@ def python(import_name: str) -> Any:
53
53
 
54
54
 
55
55
  class DataFrameCasebase(abc.Mapping):
56
+ __slots__ = ("df",)
57
+
56
58
  df: DataFrame
57
59
 
58
60
  def __init__(self, df: DataFrame) -> None:
@@ -74,10 +76,35 @@ class DataFrameCasebase(abc.Mapping):
74
76
 
75
77
 
76
78
  def dataframe(df: DataFrame) -> Casebase[Any, pd.Series]:
79
+ """Converts a pandas DataFrame into a Casebase.
80
+
81
+ Args:
82
+ df: pandas DataFrame.
83
+
84
+ Returns:
85
+ Returns a Casebase as a DataFrameCasebase.
86
+
87
+ Examples:
88
+ >>> file_path = "./data/cars-1k.csv"
89
+ >>> df = pd.read_csv(file_path)
90
+ >>> result = dataframe(df)
91
+ """
77
92
  return DataFrameCasebase(df)
78
93
 
79
94
 
80
95
  def csv(path: FilePath) -> dict[int, dict[str, str]]:
96
+ """Reads a csv file and converts it into a dict representation
97
+
98
+ Args:
99
+ path: File path of the csv file
100
+
101
+ Returns:
102
+ Dict representation of the csv file.
103
+
104
+ Examples:
105
+ >>> file_path = "./data/cars-1k.csv"
106
+ >>> result = csv(file_path)
107
+ """
81
108
  data: dict[int, dict[str, str]] = {}
82
109
 
83
110
  with open(path) as fp:
@@ -96,32 +123,105 @@ def _csv_pandas(path: FilePath) -> dict[int, pd.Series]:
96
123
  return cast(dict[int, pd.Series], dataframe(df))
97
124
 
98
125
 
99
- def json(path: FilePath) -> dict[str, Any]:
126
+ def json(path: FilePath) -> dict[Any, Any]:
127
+ """Reads a json file and converts it into a dict representation
128
+
129
+ Args:
130
+ path: File path of the json file
131
+
132
+ Returns:
133
+ Dict representation of the json file.
134
+
135
+ Examples:
136
+ >>> file_path = "data/cars-1k.json" # doctest: +SKIP
137
+ >>> json(file_path) # doctest: +SKIP
138
+ """
100
139
  with open(path, "rb") as fp:
101
- return orjson.loads(fp.read())
140
+ data = orjson.loads(fp.read())
141
+
142
+ if isinstance(data, list):
143
+ return dict(enumerate(data))
144
+ elif isinstance(data, dict):
145
+ return data
146
+ else:
147
+ raise TypeError(f"Invalid data type: {type(data)}")
102
148
 
103
149
 
104
150
  def toml(path: FilePath) -> dict[str, Any]:
151
+ """Reads a toml file and parses it into a dict representation
152
+
153
+ Args:
154
+ path: File path of the toml file
155
+
156
+ Returns:
157
+ Dict representation of the toml file.
158
+
159
+ Examples:
160
+ >>> file_path = "./data/file.toml" # doctest: +SKIP
161
+ >>> toml(file_path) # doctest: +SKIP
162
+ """
105
163
  with open(path, "rb") as fp:
106
164
  return tomllib.load(fp)
107
165
 
108
166
 
109
- def yaml(path: FilePath) -> dict[str, Any]:
110
- data: dict[str, Any] = {}
167
+ def yaml(path: FilePath) -> dict[Any, Any]:
168
+ """Reads a yaml file and parses it into a dict representation
169
+
170
+ Args:
171
+ path: File path of the yaml file
172
+
173
+ Returns:
174
+ Dict representation of the yaml file.
175
+
176
+ Examples:
177
+ >>> file_path = "./data/cars-1k.yaml"
178
+ >>> result = yaml(file_path)
179
+ """
180
+ data: dict[Any, Any] = {}
111
181
 
112
182
  with open(path, "rb") as fp:
113
- for doc in yamllib.safe_load_all(fp):
114
- data |= doc
183
+ for doc_idx, doc in enumerate(yamllib.safe_load_all(fp)):
184
+ if isinstance(doc, list):
185
+ for idx, item in enumerate(doc):
186
+ data[doc_idx + idx] = item
187
+ elif isinstance(doc, dict):
188
+ data |= doc
189
+ else:
190
+ raise TypeError(f"Invalid document type: {type(doc)}")
115
191
 
116
192
  return data
117
193
 
118
194
 
119
195
  def txt(path: FilePath) -> str:
196
+ """Reads a text file and converts it into a string
197
+
198
+ Args:
199
+ path: File path of the text file
200
+
201
+ Returns:
202
+ String representation of the text file.
203
+
204
+ Examples:
205
+ >>> file_path = "data/file.txt" # doctest: +SKIP
206
+ >>> txt(file_path) # doctest: +SKIP
207
+ """
120
208
  with open(path) as fp:
121
209
  return fp.read()
122
210
 
123
211
 
124
212
  def xml(path: FilePath) -> dict[str, Any]:
213
+ """Reads a xml file and parses it into a dict representation
214
+
215
+ Args:
216
+ path: File path of the xml file
217
+
218
+ Returns:
219
+ Dict representation of the xml file.
220
+
221
+ Examples:
222
+ >>> file_path = "data/file.xml" # doctest: +SKIP
223
+ >>> result = xml(file_path) # doctest: +SKIP
224
+ """
125
225
  with open(path, "rb") as fp:
126
226
  data = xmltodict.parse(fp.read())
127
227
 
@@ -159,6 +259,18 @@ _single_loaders: dict[str, SingleLoader] = {
159
259
 
160
260
 
161
261
  def data(path: FilePath) -> dict[str, Any]:
262
+ """Reads a file of type json, toml, yaml, or yml and parses it into a dict representation
263
+
264
+ Args:
265
+ path: Path of the file
266
+
267
+ Returns:
268
+ Dict representation of the file.
269
+
270
+ Examples:
271
+ >>> yaml_file = "./data/cars-1k.yaml"
272
+ >>> result = data(yaml_file)
273
+ """
162
274
  if isinstance(path, str):
163
275
  path = Path(path)
164
276
 
@@ -170,6 +282,18 @@ def data(path: FilePath) -> dict[str, Any]:
170
282
 
171
283
 
172
284
  def path(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
285
+ """Converts a path into a Casebase. The path can be a folder or a file.
286
+
287
+ Args:
288
+ path: Path of the file.
289
+
290
+ Returns:
291
+ Returns a Casebase.
292
+
293
+ Examples:
294
+ >>> file_path = "./data/cars-1k.csv"
295
+ >>> result = path(file_path)
296
+ """
173
297
  if isinstance(path, str):
174
298
  path = Path(path)
175
299
 
@@ -189,6 +313,19 @@ def path(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
189
313
 
190
314
 
191
315
  def file(path: Path) -> Casebase[Any, Any] | None:
316
+ """Converts a file into a Casebase. The file can be of type csv, json, toml, yaml, or yml.
317
+
318
+ Args:
319
+ path: Path of the file.
320
+
321
+ Returns:
322
+ Returns a Casebase.
323
+
324
+ Examples:
325
+ >>> from pathlib import Path
326
+ >>> file_path = Path("./data/cars-1k.csv")
327
+ >>> result = file(file_path)
328
+ """
192
329
  if path.suffix not in _batch_loaders:
193
330
  return None
194
331
 
@@ -199,6 +336,20 @@ def file(path: Path) -> Casebase[Any, Any] | None:
199
336
 
200
337
 
201
338
  def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None:
339
+ """Converts the files of a folder into a Casebase. The files can be of type txt, csv, json, toml, yaml, or yml.
340
+
341
+ Args:
342
+ path: Path of the folder.
343
+ pattern: Relative pattern for the files.
344
+
345
+ Returns:
346
+ Returns a Casebase.
347
+
348
+ Examples:
349
+ >>> from pathlib import Path
350
+ >>> folder_path = Path("./data")
351
+ >>> result = folder(folder_path, "*.csv")
352
+ """
202
353
  cb: Casebase[Any, Any] = {}
203
354
 
204
355
  for file in path.glob(pattern):
@@ -29,7 +29,7 @@ def _similarities2ranking(
29
29
  return sorted(sim_map, key=lambda key: unpack_sim(sim_map[key]), reverse=True)
30
30
 
31
31
 
32
- @dataclass
32
+ @dataclass(slots=True)
33
33
  class _Result(Generic[KeyType, ValueType, SimType]):
34
34
  similarities: SimMap[KeyType, SimType]
35
35
  ranking: list[KeyType]
@@ -47,7 +47,7 @@ class _Result(Generic[KeyType, ValueType, SimType]):
47
47
  return cls(similarities=similarities, ranking=ranking, casebase=casebase)
48
48
 
49
49
 
50
- @dataclass
50
+ @dataclass(slots=True)
51
51
  class Result(Generic[KeyType, ValueType, SimType]):
52
52
  final: _Result[KeyType, ValueType, SimType]
53
53
  intermediate: list[_Result[KeyType, ValueType, SimType]]
@@ -78,6 +78,40 @@ def apply(
78
78
  retrievers: RetrieveFunc[KeyType, ValueType, SimType]
79
79
  | Sequence[RetrieveFunc[KeyType, ValueType, SimType]],
80
80
  ) -> Result[KeyType, ValueType, SimType]:
81
+ """Applies a query to a Casebase using retriever functions.
82
+
83
+ Args:
84
+ casebase: The casebase for the query.
85
+ query: The query that will be applied to the casebase
86
+ retrievers: Retriever functions that will retrieve similar cases (compared to the query) from the casebase
87
+
88
+ Returns:
89
+ Returns an object of type Result.
90
+
91
+ Examples:
92
+ >>> import cbrkit
93
+ >>> import pandas as pd
94
+ >>> df = pd.read_csv("./data/cars-1k.csv")
95
+ >>> casebase = cbrkit.loaders.dataframe(df)
96
+ >>> query = casebase[42]
97
+ >>> retriever = cbrkit.retrieval.build(
98
+ ... cbrkit.global_sim.attribute_value(
99
+ ... attributes={
100
+ ... "price": cbrkit.sim.numeric.linear(max=100000),
101
+ ... "year": cbrkit.sim.numeric.linear(max=50),
102
+ ... "manufacturer": cbrkit.sim.taxonomy.load(
103
+ ... "./data/cars-taxonomy.yaml",
104
+ ... measure=cbrkit.sim.taxonomy.wu_palmer(),
105
+ ... ),
106
+ ... "miles": cbrkit.sim.numeric.linear(max=1000000),
107
+ ... },
108
+ ... types_fallback=cbrkit.sim.generic.equality(),
109
+ ... aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
110
+ ... ),
111
+ ... limit=5,
112
+ ... )
113
+ >>> result = cbrkit.retrieval.apply(casebase, query, retriever)
114
+ """
81
115
  if not isinstance(retrievers, Sequence):
82
116
  retrievers = [retrievers]
83
117
 
@@ -99,6 +133,37 @@ def build(
99
133
  similarity_func: AnySimFunc[KeyType, ValueType, SimType],
100
134
  limit: int | None = None,
101
135
  ) -> RetrieveFunc[KeyType, ValueType, SimType]:
136
+ """Based on the similarity function this function creates a retriever function.
137
+
138
+ Args:
139
+ similarity_func: Similarity function to compute the similarity between cases.
140
+ limit: Retriever function will return the top limit cases.
141
+
142
+ Returns:
143
+ Returns the retriever function.
144
+
145
+ Examples:
146
+ >>> import cbrkit
147
+ >>> retriever = cbrkit.retrieval.build(
148
+ ... cbrkit.global_sim.attribute_value(
149
+ ... attributes={
150
+ ... "price": cbrkit.sim.numeric.linear(max=100000),
151
+ ... "year": cbrkit.sim.numeric.linear(max=50),
152
+ ... "model": cbrkit.global_sim.attribute_value(
153
+ ... attributes={
154
+ ... "make": cbrkit.sim.generic.equality(),
155
+ ... "manufacturer": cbrkit.sim.taxonomy.load(
156
+ ... "./data/cars-taxonomy.yaml",
157
+ ... measure=cbrkit.sim.taxonomy.wu_palmer(),
158
+ ... ),
159
+ ... }
160
+ ... ),
161
+ ... },
162
+ ... aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
163
+ ... ),
164
+ ... limit=5,
165
+ ... )
166
+ """
102
167
  sim_func = sim2map(similarity_func)
103
168
 
104
169
  def wrapped_func(
@@ -26,12 +26,35 @@ __all__ = [
26
26
 
27
27
 
28
28
  def dist2sim(distance: float) -> float:
29
+ """Convert a distance to a similarity.
30
+
31
+ Args:
32
+ distance: The distance to convert
33
+
34
+ Examples:
35
+ >>> dist2sim(1.)
36
+ 0.5
37
+ """
29
38
  return 1 / (1 + distance)
30
39
 
31
40
 
32
41
  def sim2seq(
33
42
  func: SimPairFunc[ValueType, SimType] | SimSeqFunc[ValueType, SimType],
34
43
  ) -> SimSeqFunc[ValueType, SimType]:
44
+ """
45
+ Converts a similarity function that operates on pairs of values into a similarity function that operates on sequences of values.
46
+
47
+ Args:
48
+ func: The similarity function to be converted.
49
+
50
+ Examples:
51
+ >>> def sim_func(x: int, y: int) -> float:
52
+ ... return abs(x - y) / max(x, y)
53
+ ...
54
+ >>> seq_func = sim2seq(sim_func)
55
+ >>> seq_func([(1, 2), (3, 4), (5, 6)])
56
+ [0.5, 0.25, 0.16666666666666666]
57
+ """
35
58
  signature = inspect_signature(func)
36
59
 
37
60
  if len(signature.parameters) == 2:
@@ -4,8 +4,17 @@ from typing import Any
4
4
  from cbrkit.sim._helpers import dist2sim
5
5
  from cbrkit.typing import SimPairFunc
6
6
 
7
+ __all__ = ["jaccard"]
8
+
7
9
 
8
10
  def jaccard() -> SimPairFunc[Collection[Any], float]:
11
+ """Jaccard similarity function.
12
+
13
+ Examples:
14
+ >>> sim = jaccard()
15
+ >>> sim(["a", "b", "c", "d"], ["a", "b", "c"])
16
+ 0.8
17
+ """
9
18
  from nltk.metrics import jaccard_distance
10
19
 
11
20
  def wrapped_func(x: Collection[Any], y: Collection[Any]) -> float:
@@ -7,12 +7,29 @@ from cbrkit.typing import (
7
7
  ValueType,
8
8
  )
9
9
 
10
+ __all__ = ["table", "equality"]
11
+
10
12
 
11
13
  def table(
12
14
  entries: Sequence[tuple[ValueType, ValueType, float]],
13
15
  symmetric: bool = True,
14
16
  default: float = 0.0,
15
17
  ) -> SimPairFunc[ValueType, float]:
18
+ """Allows importing similarity values from a table.
19
+
20
+ Args:
21
+ entries: Sequence[tuple[a, b, sim(a, b)]]
22
+ symmetric: If True, the table is assumed to be symmetric, i.e. sim(a, b) = sim(b, a)
23
+ default: Default similarity value for pairs not in the table
24
+
25
+ Examples:
26
+ >>> sim = table([("a", "b", 0.5), ("b", "c", 0.7)], symmetric=True, default=0.0)
27
+ >>> sim("b", "a")
28
+ 0.5
29
+ >>> sim("a", "c")
30
+ 0.0
31
+ """
32
+
16
33
  table: defaultdict[ValueType, defaultdict[ValueType, float]] = defaultdict(
17
34
  lambda: defaultdict(lambda: default)
18
35
  )
@@ -30,6 +47,16 @@ def table(
30
47
 
31
48
 
32
49
  def equality() -> SimPairFunc[Any, float]:
50
+ """Equality similarity function. Returns 1.0 if the two values are equal, 0.0 otherwise.
51
+
52
+ Examples:
53
+ >>> sim = equality()
54
+ >>> sim("b", "a")
55
+ 0.0
56
+ >>> sim("a", "a")
57
+ 1.0
58
+ """
59
+
33
60
  def wrapped_func(x: Any, y: Any) -> float:
34
61
  return 1.0 if x == y else 0.0
35
62
 
@@ -15,6 +15,9 @@ def linear(max: float, min: float = 0.0) -> SimPairFunc[Number, float]:
15
15
  min: Minimum bound of the interval
16
16
 
17
17
  ![linear](../../assets/numeric/linear.png)
18
+ >>> sim = linear(100)
19
+ >>> sim(50, 60)
20
+ 0.9
18
21
  """
19
22
 
20
23
  def wrapped_func(x: Number, y: Number) -> float:
@@ -37,6 +40,12 @@ def threshold(threshold: float) -> SimPairFunc[Number, float]:
37
40
  threshold: If the absolute difference between the two values is less than or equal to this value, the similarity is 1.0, otherwise it is 0.0
38
41
 
39
42
  ![threshold](../../assets/numeric/threshold.png)
43
+ Examples:
44
+ >>> sim = threshold(10)
45
+ >>> sim(50, 60)
46
+ 1.0
47
+ >>> sim(50, 61)
48
+ 0.0
40
49
  """
41
50
 
42
51
  def wrapped_func(x: Number, y: Number) -> float:
@@ -49,9 +58,13 @@ def exponential(alpha: float = 1.0) -> SimPairFunc[Number, float]:
49
58
  """Exponential similarity function.
50
59
 
51
60
  Args:
52
- alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the function grows.
61
+ alpha: Controls the growth of the exponential function for the similarity. The larger alpha is, the faster the similarity decreases.
53
62
 
54
63
  ![exponential](../../assets/numeric/exponential.png)
64
+ Examples:
65
+ >>> sim = exponential(0.1)
66
+ >>> sim(50, 60)
67
+ 0.36787944117144233
55
68
  """
56
69
 
57
70
  def wrapped_func(x: Number, y: Number) -> float:
@@ -68,6 +81,12 @@ def sigmoid(alpha: float = 1.0, theta: float = 1.0) -> SimPairFunc[Number, float
68
81
  theta: Specifies the point at which the similarity value is 0.5.
69
82
 
70
83
  ![sigmoid](../../assets/numeric/sigmoid.png)
84
+ Examples:
85
+ >>> sim = sigmoid(1, 10)
86
+ >>> sim(50, 60)
87
+ 0.5
88
+ >>> sim(50, 58)
89
+ 0.8807970779778823
71
90
  """
72
91
 
73
92
  def wrapped_func(x: Number, y: Number) -> float:
@@ -12,8 +12,24 @@ from cbrkit.typing import (
12
12
  SimSeqFunc,
13
13
  )
14
14
 
15
+ __all__ = [
16
+ "spacy",
17
+ "sentence_transformers",
18
+ "openai",
19
+ "levenshtein",
20
+ "jaro",
21
+ "jaro_winkler",
22
+ "table",
23
+ ]
24
+
15
25
 
16
26
  def _cosine(u, v) -> float:
27
+ """Cosine similarity between two vectors
28
+
29
+ Args:
30
+ u: First vector
31
+ v: Second vector
32
+ """
17
33
  import numpy as np
18
34
  import scipy.spatial.distance as scipy_dist
19
35
 
@@ -28,6 +44,11 @@ def _unique_items(pairs: Sequence[tuple[str, str]]) -> list[str]:
28
44
 
29
45
 
30
46
  def spacy(model_name: str = "en_core_web_lg") -> SimSeqFunc[str, float]:
47
+ """[spaCy](https://spacy.io/usage/linguistic-features/#vectors-similarity) based semantic similarity using word vectors. It calculates the similarity between given text pairs.
48
+
49
+ Args:
50
+ model_name: Name of the [spaCy model](https://spacy.io/usage/models) to use to generate word vectors. Defaults to "en_core_web_lg".
51
+ """
31
52
  from spacy import load as spacy_load
32
53
 
33
54
  nlp = spacy_load(model_name)
@@ -46,6 +67,11 @@ def spacy(model_name: str = "en_core_web_lg") -> SimSeqFunc[str, float]:
46
67
 
47
68
 
48
69
  def sentence_transformers(model_name: str) -> SimSeqFunc[str, float]:
70
+ """[Sentence-Transformers](https://www.sbert.net/) based semantic similarity using word vectors. It calculates the cosine similarity between given text pairs.
71
+
72
+ Args:
73
+ model_name: Name of the [pretrained model](https://www.sbert.net/docs/pretrained_models.html) to use to generate word vectors.
74
+ """
49
75
  from sentence_transformers import SentenceTransformer
50
76
 
51
77
  model = SentenceTransformer(model_name)
@@ -61,6 +87,11 @@ def sentence_transformers(model_name: str) -> SimSeqFunc[str, float]:
61
87
 
62
88
 
63
89
  def openai(model_name: str) -> SimSeqFunc[str, float]:
90
+ """Semantic similarity using word vectors generated by one of OpenAI's embedding models. It calculates the cosine similarity between given text pairs.
91
+
92
+ Args:
93
+ model_name: Name of the [embedding model](https://platform.openai.com/docs/models/embeddings) to use to generate word vectors.
94
+ """
64
95
  import numpy as np
65
96
  from openai import Client
66
97
 
@@ -78,6 +109,18 @@ def openai(model_name: str) -> SimSeqFunc[str, float]:
78
109
 
79
110
 
80
111
  def levenshtein(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
112
+ """Similarity function that calculates a normalized indel similarity between two strings based on [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance).
113
+
114
+ Args:
115
+ score_cutoff: If the similarity is less than this value, the function will return 0.0.
116
+ Examples:
117
+ >>> sim = levenshtein()
118
+ >>> sim("kitten", "sitting")
119
+ 0.6153846153846154
120
+ >>> sim = levenshtein(score_cutoff=0.8)
121
+ >>> sim("kitten", "sitting")
122
+ 0.0
123
+ """
81
124
  import Levenshtein
82
125
 
83
126
  def wrapped_func(x: str, y: str) -> float:
@@ -87,6 +130,18 @@ def levenshtein(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
87
130
 
88
131
 
89
132
  def jaro(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
133
+ """Jaro similarity function to compute similarity between two strings.
134
+
135
+ Args:
136
+ score_cutoff: If the similarity is less than this value, the function will return 0.0.
137
+ Examples:
138
+ >>> sim = jaro()
139
+ >>> sim("kitten", "sitting")
140
+ 0.746031746031746
141
+ >>> sim = jaro(score_cutoff=0.8)
142
+ >>> sim("kitten", "sitting")
143
+ 0.0
144
+ """
90
145
  import Levenshtein
91
146
 
92
147
  def wrapped_func(x: str, y: str) -> float:
@@ -96,8 +151,21 @@ def jaro(score_cutoff: float | None = None) -> SimPairFunc[str, float]:
96
151
 
97
152
 
98
153
  def jaro_winkler(
99
- score_cutoff: float | None = None, prefix_weight: float | None = None
154
+ score_cutoff: float | None = None, prefix_weight: float = 0.1
100
155
  ) -> SimPairFunc[str, float]:
156
+ """Jaro-Winkler similarity function to compute similarity between two strings.
157
+
158
+ Args:
159
+ score_cutoff: If the similarity is less than this value, the function will return 0.0.
160
+ prefix_weight: Weight used for the common prefix of the two strings. Has to be between 0 and 0.25. Default is 0.1.
161
+ Examples:
162
+ >>> sim = jaro_winkler()
163
+ >>> sim("kitten", "sitting")
164
+ 0.746031746031746
165
+ >>> sim = jaro_winkler(score_cutoff=0.8)
166
+ >>> sim("kitten", "sitting")
167
+ 0.0
168
+ """
101
169
  import Levenshtein
102
170
 
103
171
  def wrapped_func(x: str, y: str) -> float:
@@ -113,6 +181,20 @@ def table(
113
181
  symmetric: bool = True,
114
182
  default: float = 0.0,
115
183
  ) -> SimPairFunc[str, float]:
184
+ """Allows importing similarity values from a table.
185
+
186
+ Args:
187
+ entries: Sequence[tuple[a, b, sim(a, b)]]
188
+ symmetric: If True, the table is assumed to be symmetric, i.e. sim(a, b) = sim(b, a)
189
+ default: Default similarity value for pairs not in the table
190
+
191
+ Examples:
192
+ >>> sim = table([("a", "b", 0.5), ("b", "c", 0.7)], symmetric=True, default=0.0)
193
+ >>> sim("b", "a")
194
+ 0.5
195
+ >>> sim("a", "c")
196
+ 0.0
197
+ """
116
198
  if isinstance(entries, FilePath):
117
199
  if isinstance(entries, str):
118
200
  entries = Path(entries)
@@ -4,6 +4,8 @@ from typing import Optional, Protocol, TypedDict, cast
4
4
  from cbrkit.loaders import data as load_data
5
5
  from cbrkit.typing import FilePath, SimPairFunc
6
6
 
7
+ __all__ = ["Taxonomy", "TaxonomyNode", "TaxonomyFunc", "load", "wu_palmer"]
8
+
7
9
 
8
10
  class SerializedNode(TypedDict, total=False):
9
11
  key: str
@@ -11,7 +13,7 @@ class SerializedNode(TypedDict, total=False):
11
13
  children: list["SerializedNode | str"]
12
14
 
13
15
 
14
- @dataclass
16
+ @dataclass(slots=True)
15
17
  class TaxonomyNode:
16
18
  key: str
17
19
  weight: float | None
@@ -21,6 +23,8 @@ class TaxonomyNode:
21
23
 
22
24
 
23
25
  class Taxonomy:
26
+ __slots__ = ("root", "nodes")
27
+
24
28
  root: TaxonomyNode
25
29
  nodes: dict[str, TaxonomyNode]
26
30
 
@@ -74,6 +78,15 @@ class TaxonomyFunc(Protocol):
74
78
 
75
79
 
76
80
  def wu_palmer() -> TaxonomyFunc:
81
+ """Wu & Palmer similarity measure of two nodes in a taxonomy.
82
+ >>> taxonomy = Taxonomy("./data/cars-taxonomy.yaml")
83
+ >>> sim = wu_palmer()
84
+ >>> sim(taxonomy, "audi", "porsche")
85
+ 0.5
86
+ >>> sim(taxonomy, "audi", "bmw")
87
+ 0.0
88
+ """
89
+
77
90
  def wrapped_func(taxonomy: Taxonomy, x: str, y: str) -> float:
78
91
  node1 = taxonomy.nodes[x]
79
92
  node2 = taxonomy.nodes[y]
@@ -90,6 +103,13 @@ _taxonomy_func = wu_palmer()
90
103
  def load(
91
104
  path: FilePath, measure: TaxonomyFunc = _taxonomy_func
92
105
  ) -> SimPairFunc[str, float]:
106
+ """Load a taxonomy and return a function that measures the similarity.
107
+ >>> sim = load("./data/cars-taxonomy.yaml", measure=wu_palmer())
108
+ >>> sim("audi", "porsche")
109
+ 0.5
110
+ >>> sim("audi", "bmw")
111
+ 0.0
112
+ """
93
113
  taxonomy = Taxonomy(path)
94
114
 
95
115
  def wrapped_func(x: str, y: str) -> float:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cbrkit"
3
- version = "0.2.1"
3
+ version = "0.3.1"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
5
5
  authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
6
6
  license = "MIT"
@@ -40,53 +40,47 @@ cbrkit = "cbrkit.cli:app"
40
40
 
41
41
  [tool.poetry.dependencies]
42
42
  python = ">=3.11, <3.13"
43
+ fastapi = { version = ">=0.100, <1.0", optional = true, extras = ["all"] }
44
+ levenshtein = { version = ">=0.23, <1.0", optional = true }
45
+ nltk = { version = "^3.8", optional = true }
46
+ openai = { version = "^1.5", optional = true }
47
+ orjson = "^3.9"
43
48
  pandas = "^2.1"
49
+ pyarrow = ">=13.0"
44
50
  pyyaml = "^6.0"
45
- orjson = "^3.9"
46
- xmltodict = "^0.13"
47
- pyarrow = "^14.0"
48
- typer = { version = "^0.9", extras = ["all"], optional = true }
49
- fastapi = { version = "^0.104", optional = true, extras = ["all"] }
50
- uvicorn = { version = "^0.24", optional = true, extras = ["standard"] }
51
- spacy = { version = "^3.7", optional = true }
52
- nltk = { version = "^3.8", optional = true }
53
- levenshtein = { version = "^0.23", optional = true }
54
51
  sentence-transformers = { version = "^2.2", optional = true }
55
- openai = { version = "^1.3", optional = true }
52
+ spacy = { version = "^3.7", optional = true }
56
53
  torch = { version = "^2.1.1", optional = true }
57
- transformers = { version = "^4.36", optional = true }
54
+ transformers = { version = "^4.35", optional = true }
55
+ typer = { version = "^0.9", extras = ["all"], optional = true }
56
+ uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] }
57
+ xmltodict = "^0.13"
58
58
 
59
59
  [tool.poetry.group.dev.dependencies]
60
- pytest = "^7.4"
60
+ pytest = "^8.0.0"
61
61
  pytest-cov = "^4.1"
62
62
 
63
63
  [tool.poetry.group.docs.dependencies]
64
- pdoc = "^14.1"
64
+ pdoc = "^14.4"
65
65
 
66
66
  [tool.poetry.extras]
67
67
  all = [
68
- "typer",
69
68
  "fastapi",
70
- "uvicorn",
71
- "spacy",
72
- "nltk",
73
69
  "levenshtein",
74
- "sentence-transformers",
70
+ "nltk",
75
71
  "openai",
72
+ "sentence-transformers",
73
+ "spacy",
74
+ "spacy",
76
75
  "torch",
77
76
  "transformers",
77
+ "typer",
78
+ "uvicorn",
78
79
  ]
79
80
  cli = ["typer"]
80
81
  api = ["fastapi", "uvicorn"]
81
- nlp = [
82
- "spacy",
83
- "nltk",
84
- "levenshtein",
85
- "sentence-transformers",
86
- "openai",
87
- "torch",
88
- "transformers",
89
- ]
82
+ nlp = ["levenshtein", "nltk", "openai", "spacy"]
83
+ transformers = ["sentence-transformers", "torch", "transformers"]
90
84
 
91
85
  [tool.pytest.ini_options]
92
86
  addopts = "--cov cbrkit --cov-report term-missing --doctest-modules --ignore cbrkit/cli.py --ignore cbrkit/api.py --ignore result"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes