cbrkit 1.0.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. cbrkit-1.0.0/README.md → cbrkit-1.2.0/PKG-INFO +491 -63
  2. cbrkit-1.0.0/PKG-INFO → cbrkit-1.2.0/README.md +390 -148
  3. {cbrkit-1.0.0 → cbrkit-1.2.0}/pyproject.toml +45 -38
  4. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/__init__.py +8 -2
  5. cbrkit-1.2.0/src/cbrkit/adapt/__init__.py +39 -0
  6. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/attribute_value.py +13 -20
  7. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/generic.py +7 -4
  8. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/numbers.py +4 -4
  9. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/strings.py +1 -0
  10. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/api.py +45 -9
  11. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/cli.py +57 -25
  12. cbrkit-1.2.0/src/cbrkit/cycle.py +128 -0
  13. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/dumpers.py +11 -8
  14. cbrkit-1.2.0/src/cbrkit/eval/__init__.py +61 -0
  15. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/eval/common.py +113 -10
  16. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/eval/retrieval.py +4 -0
  17. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/helpers.py +162 -56
  18. cbrkit-1.2.0/src/cbrkit/indexable.py +717 -0
  19. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/loaders.py +124 -18
  20. cbrkit-1.2.0/src/cbrkit/model/__init__.py +48 -0
  21. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/model/graph.py +48 -19
  22. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/model/result.py +50 -28
  23. cbrkit-1.2.0/src/cbrkit/retain/__init__.py +68 -0
  24. cbrkit-1.2.0/src/cbrkit/retain/apply.py +155 -0
  25. cbrkit-1.2.0/src/cbrkit/retain/build.py +151 -0
  26. cbrkit-1.2.0/src/cbrkit/retain/storage.py +135 -0
  27. cbrkit-1.2.0/src/cbrkit/retrieval/__init__.py +112 -0
  28. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/retrieval/apply.py +53 -14
  29. cbrkit-1.2.0/src/cbrkit/retrieval/build.py +108 -0
  30. cbrkit-1.2.0/src/cbrkit/retrieval/indexable.py +1050 -0
  31. cbrkit-1.2.0/src/cbrkit/retrieval/rerank.py +219 -0
  32. cbrkit-1.2.0/src/cbrkit/retrieval/wrappers.py +492 -0
  33. cbrkit-1.2.0/src/cbrkit/reuse/__init__.py +52 -0
  34. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/reuse/apply.py +11 -4
  35. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/reuse/build.py +59 -17
  36. cbrkit-1.2.0/src/cbrkit/revise/__init__.py +50 -0
  37. cbrkit-1.2.0/src/cbrkit/revise/apply.py +155 -0
  38. cbrkit-1.2.0/src/cbrkit/revise/build.py +141 -0
  39. cbrkit-1.2.0/src/cbrkit/sim/__init__.py +78 -0
  40. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/aggregator.py +13 -10
  41. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/attribute_value.py +24 -9
  42. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/collections.py +8 -2
  43. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/embed.py +435 -73
  44. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/generic.py +7 -5
  45. cbrkit-1.2.0/src/cbrkit/sim/graphs/__init__.py +88 -0
  46. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/alignment.py +10 -7
  47. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/astar.py +23 -4
  48. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/brute_force.py +1 -0
  49. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/common.py +62 -14
  50. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/dfs.py +6 -2
  51. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/lap.py +7 -1
  52. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/vf2.py +12 -4
  53. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/numbers.py +4 -4
  54. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/strings.py +4 -2
  55. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/taxonomy.py +17 -6
  56. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/wrappers.py +55 -21
  57. cbrkit-1.2.0/src/cbrkit/synthesis/__init__.py +60 -0
  58. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/apply.py +4 -2
  59. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/build.py +18 -2
  60. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/model.py +13 -0
  61. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/prompts.py +19 -9
  62. cbrkit-1.2.0/src/cbrkit/synthesis/providers/__init__.py +80 -0
  63. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/anthropic.py +5 -2
  64. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/cohere.py +4 -0
  65. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/google.py +2 -0
  66. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/instructor.py +2 -0
  67. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/model.py +9 -0
  68. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/ollama.py +2 -0
  69. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/openai_agents.py +9 -7
  70. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/openai_completions.py +4 -1
  71. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/openai_responses.py +5 -3
  72. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/pydantic_ai.py +9 -13
  73. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/wrappers.py +4 -0
  74. cbrkit-1.2.0/src/cbrkit/system.py +199 -0
  75. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/typing.py +98 -21
  76. cbrkit-1.0.0/src/cbrkit/adapt/__init__.py +0 -18
  77. cbrkit-1.0.0/src/cbrkit/cycle.py +0 -60
  78. cbrkit-1.0.0/src/cbrkit/eval/__init__.py +0 -31
  79. cbrkit-1.0.0/src/cbrkit/model/__init__.py +0 -11
  80. cbrkit-1.0.0/src/cbrkit/retrieval/__init__.py +0 -39
  81. cbrkit-1.0.0/src/cbrkit/retrieval/build.py +0 -362
  82. cbrkit-1.0.0/src/cbrkit/retrieval/rerank.py +0 -347
  83. cbrkit-1.0.0/src/cbrkit/reuse/__init__.py +0 -21
  84. cbrkit-1.0.0/src/cbrkit/sim/__init__.py +0 -48
  85. cbrkit-1.0.0/src/cbrkit/sim/graphs/__init__.py +0 -50
  86. cbrkit-1.0.0/src/cbrkit/sim/graphs/precompute.py +0 -80
  87. cbrkit-1.0.0/src/cbrkit/synthesis/__init__.py +0 -27
  88. cbrkit-1.0.0/src/cbrkit/synthesis/providers/__init__.py +0 -43
  89. cbrkit-1.0.0/src/cbrkit/system.py +0 -100
  90. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/__main__.py +0 -0
  91. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/constants.py +0 -0
  92. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/py.typed +0 -0
  93. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/greedy.py +0 -0
  94. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/qap.py +0 -0
  95. {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/pooling.py +0 -0
@@ -1,3 +1,104 @@
1
+ Metadata-Version: 2.3
2
+ Name: cbrkit
3
+ Version: 1.2.0
4
+ Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
5
+ Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
6
+ Author: Mirko Lenz
7
+ Author-email: Mirko Lenz <mirko@mirkolenz.com>
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Environment :: Console
10
+ Classifier: Framework :: Pytest
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Natural Language :: English
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Dist: frozendict>=2,<3
25
+ Requires-Dist: numpy>=2,<3
26
+ Requires-Dist: orjson>=3,<4
27
+ Requires-Dist: polars>=1,<2
28
+ Requires-Dist: pydantic>=2,<3
29
+ Requires-Dist: pyyaml>=6,<7
30
+ Requires-Dist: rtoml>=0.12,<1
31
+ Requires-Dist: scipy>=1,<2
32
+ Requires-Dist: xmltodict>=1,<2
33
+ Requires-Dist: cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pydantic-ai,spacy,sql,timeseries,transformers,voyageai,zvec] ; extra == 'all'
34
+ Requires-Dist: anthropic>=0.40,<1 ; extra == 'anthropic'
35
+ Requires-Dist: cbrkit[cli] ; extra == 'api'
36
+ Requires-Dist: fastapi>=0.100,<1 ; extra == 'api'
37
+ Requires-Dist: pydantic-settings>=2,<3 ; extra == 'api'
38
+ Requires-Dist: python-multipart>=0.0.15,<1 ; extra == 'api'
39
+ Requires-Dist: uvicorn[standard]>=0.30,<1 ; extra == 'api'
40
+ Requires-Dist: fastmcp>=3,<4 ; extra == 'api'
41
+ Requires-Dist: bm25s[core,stem,indexing]>=0.3,<1 ; extra == 'bm25'
42
+ Requires-Dist: chromadb>=1,<2 ; extra == 'chromadb'
43
+ Requires-Dist: chonkie>=1,<2 ; extra == 'chunking'
44
+ Requires-Dist: rich>=13,<15 ; extra == 'cli'
45
+ Requires-Dist: typer>=0.9,<1 ; extra == 'cli'
46
+ Requires-Dist: cohere>=5,<6 ; extra == 'cohere'
47
+ Requires-Dist: ranx>=0.3,<1 ; extra == 'eval'
48
+ Requires-Dist: google-genai>=1,<2 ; extra == 'google'
49
+ Requires-Dist: networkx>=3,<4 ; extra == 'graphs'
50
+ Requires-Dist: rustworkx>=0.15,<1 ; extra == 'graphs'
51
+ Requires-Dist: pygraphviz>=1,<2 ; extra == 'graphviz'
52
+ Requires-Dist: instructor>=1,<2 ; extra == 'instructor'
53
+ Requires-Dist: lancedb>=0.20,<1 ; extra == 'lancedb'
54
+ Requires-Dist: levenshtein>=0.26,<1 ; extra == 'levenshtein'
55
+ Requires-Dist: nltk>=3,<4 ; extra == 'nltk'
56
+ Requires-Dist: ollama>=0.3,<1 ; extra == 'ollama'
57
+ Requires-Dist: openai>=1,<3 ; extra == 'openai'
58
+ Requires-Dist: tiktoken>=0.8,<1 ; extra == 'openai'
59
+ Requires-Dist: openai-agents>=0.2,<1 ; extra == 'openai-agents'
60
+ Requires-Dist: pandas>=2,<4 ; extra == 'pandas'
61
+ Requires-Dist: pydantic-ai-slim>=1,<2 ; extra == 'pydantic-ai'
62
+ Requires-Dist: spacy>=3.8,<4 ; extra == 'spacy'
63
+ Requires-Dist: sqlalchemy>=2,<3 ; extra == 'sql'
64
+ Requires-Dist: minineedle>=3,<4 ; extra == 'timeseries'
65
+ Requires-Dist: sentence-transformers>=4,<6 ; extra == 'transformers'
66
+ Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
67
+ Requires-Dist: transformers>=4,<6 ; extra == 'transformers'
68
+ Requires-Dist: voyageai>=0.3,<1 ; extra == 'voyageai'
69
+ Requires-Python: >=3.13, <4
70
+ Project-URL: Repository, https://github.com/wi2trier/cbrkit
71
+ Project-URL: Documentation, https://wi2trier.github.io/cbrkit/
72
+ Project-URL: Issues, https://github.com/wi2trier/cbrkit/issues
73
+ Project-URL: Changelog, https://github.com/wi2trier/cbrkit/releases
74
+ Provides-Extra: all
75
+ Provides-Extra: anthropic
76
+ Provides-Extra: api
77
+ Provides-Extra: bm25
78
+ Provides-Extra: chromadb
79
+ Provides-Extra: chunking
80
+ Provides-Extra: cli
81
+ Provides-Extra: cohere
82
+ Provides-Extra: eval
83
+ Provides-Extra: google
84
+ Provides-Extra: graphs
85
+ Provides-Extra: graphviz
86
+ Provides-Extra: instructor
87
+ Provides-Extra: lancedb
88
+ Provides-Extra: levenshtein
89
+ Provides-Extra: nltk
90
+ Provides-Extra: ollama
91
+ Provides-Extra: openai
92
+ Provides-Extra: openai-agents
93
+ Provides-Extra: pandas
94
+ Provides-Extra: pydantic-ai
95
+ Provides-Extra: spacy
96
+ Provides-Extra: sql
97
+ Provides-Extra: timeseries
98
+ Provides-Extra: transformers
99
+ Provides-Extra: voyageai
100
+ Description-Content-Type: text/markdown
101
+
1
102
  <!-- markdownlint-disable MD033 MD041 -->
2
103
  <h1><p align="center">CBRkit</p></h1>
3
104
 
@@ -36,26 +137,30 @@ To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-d
36
137
  Further examples can be found in our [tests](./tests/test_retrieve.py) and [documentation](https://wi2trier.github.io/cbrkit/).
37
138
  The following modules are part of CBRkit:
38
139
 
39
- - `cbrkit.loaders` and `cbrkit.dumpers`: Functions for loading and exporting cases and queries.
40
- - `cbrkit.sim`: Similarity functions for common data types and some utility functions such as `cache`, `combine`, `transpose`, etc.
41
- - `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, semantic, etc.).
140
+ - `cbrkit.loaders`: Functions for loading cases and queries from various file formats and data sources.
141
+ - `cbrkit.dumpers`: Functions for exporting data to JSON, YAML, CSV, TOML, and Markdown.
142
+ - `cbrkit.sim`: Similarity measures for common data types with utility functions such as `cache`, `combine`, `transpose`, etc.
143
+ - `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, spaCy, etc.).
42
144
  - `cbrkit.sim.numbers`: Numeric similarity measures (linear, exponential, threshold).
43
- - `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, DTW, Smith-Waterman).
145
+ - `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, etc.).
44
146
  - `cbrkit.sim.embed`: Embedding-based similarity functions with caching support.
45
- - `cbrkit.sim.graphs`: Graph similarity algorithms including GED, A*, VF2, and more.
46
- - `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions.
147
+ - `cbrkit.sim.graphs`: Graph similarity algorithms including A\*, VF2, greedy, LAP, and more.
148
+ - `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions (Wu-Palmer, etc.).
47
149
  - `cbrkit.sim.generic`: Generic similarity functions (equality, tables, static).
48
150
  - `cbrkit.sim.attribute_value`: Similarity for attribute-value based data.
49
151
  - `cbrkit.sim.pooling`: Functions for aggregating multiple similarity values.
50
152
  - `cbrkit.sim.aggregator`: Combines multiple local measures into global scores.
51
- - `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines, includes BM25 retrieval, rerankers, etc.
52
- - `cbrkit.adapt`: Adaptation generator functions for adapting cases based on a query.
53
- - `cbrkit.reuse`: Functions for defining and applying reuse pipelines.
153
+ - `cbrkit.adapt`: Adaptation functions for adapting cases based on a query.
154
+ - `cbrkit.retrieval`: Retrieval pipelines with BM25, embedding-based retrieval, re-ranking (Cohere, Voyage AI, Sentence Transformers), and more.
155
+ - `cbrkit.reuse`: Reuse pipelines that apply adaptation and score the results.
156
+ - `cbrkit.revise`: Revision pipelines for assessing and optionally repairing solutions.
157
+ - `cbrkit.retain`: Retention pipelines for storing solved cases back into the casebase.
158
+ - `cbrkit.cycle`: Full CBR cycle orchestration across all four phases.
159
+ - `cbrkit.system`: High-level `System` class for composing all CBR phases into a single object.
160
+ - `cbrkit.synthesis`: LLM-based synthesis for generating insights from cases (RAG), with providers for OpenAI, Anthropic, Cohere, Google, Ollama, and more.
54
161
  - `cbrkit.eval`: Evaluation metrics for retrieval results including precision, recall, and custom metrics.
55
- - `cbrkit.model`: Data models for graphs and results.
56
- - `cbrkit.cycle`: CBR cycle implementation.
162
+ - `cbrkit.model`: Data models for results and graph structures.
57
163
  - `cbrkit.typing`: Generic type definitions for defining custom functions.
58
- - `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g., in a RAG context.
59
164
 
60
165
  ## Installation
61
166
 
@@ -74,14 +179,12 @@ pip install cbrkit[EXTRA_NAME,...]
74
179
  where `EXTRA_NAME` is one of the following:
75
180
 
76
181
  - `all`: All optional dependencies
77
- - `api`: REST API Server
78
- - `cli`: Command Line Interface (CLI)
79
- - `eval`: Evaluation tools for common metrics like `precision` and `recall`
80
- - `graphs`: Graph libraries like `networkx` and `rustworkx`
81
- - `llm`: Large Language Models (LLM) APIs like Ollama and OpenAI
82
- - `nlp`: Standalone NLP tools `levenshtein`, `nltk`, `openai`, and `spacy`
83
- - `timeseries`: Time series similarity measures like `dtw` and `smith_waterman`
84
- - `transformers`: Advanced NLP tools based on `pytorch` and `transformers`
182
+ - **LLM providers:** `anthropic`, `cohere`, `google`, `ollama`, `openai`, `openai-agents`, `pydantic-ai`, `instructor`, `voyageai`
183
+ - **NLP / text processing:** `bm25`, `chunking`, `levenshtein`, `nltk`, `spacy`
184
+ - **ML / embeddings:** `transformers` (includes `pytorch` and `sentence-transformers`)
185
+ - **Graphs:** `graphs` (`networkx` and `rustworkx`), `graphviz`
186
+ - **Data backends:** `chromadb`, `lancedb`, `pandas`, `sql` (SQLAlchemy), `zvec`
187
+ - **Tools:** `cli` (CLI), `api` (REST API server), `eval` (evaluation metrics), `timeseries` (DTW, Smith-Waterman)
85
188
 
86
189
  Alternatively, you can also clone this Git repository and install CBRkit and its dependencies via uv: `uv sync --all-extras`
87
190
 
@@ -95,7 +198,8 @@ We provide predefined functions for the following formats:
95
198
  - toml
96
199
  - xml
97
200
  - yaml
98
- - py (object inside of a python file).
201
+ - txt (plain text)
202
+ - py (object inside of a python file)
99
203
 
100
204
  Loading one of those formats can be done via the `file` function:
101
205
 
@@ -104,8 +208,18 @@ import cbrkit
104
208
  casebase = cbrkit.loaders.file("path/to/cases.[json,toml,yaml,xml,csv]")
105
209
  ```
106
210
 
107
- Additionally, CBRkit also integrates with `polars` and `pandas` for loading data frames.
108
- The following example shows how to load cases and queries from a CSV file using `polars`:
211
+ You can also load all files from a directory or use the unified `path` function:
212
+
213
+ ```python
214
+ # Load all files matching a glob pattern from a directory
215
+ casebase = cbrkit.loaders.directory("path/to/cases/", pattern="*.json")
216
+
217
+ # Unified path function: auto-detects whether path is a file or directory
218
+ casebase = cbrkit.loaders.path("path/to/cases.json") # single file
219
+ casebase = cbrkit.loaders.path("path/to/cases/") # directory
220
+ ```
221
+
222
+ Additionally, CBRkit integrates with `polars` and `pandas` for loading data frames:
109
223
 
110
224
  ```python
111
225
  import polars as pl
@@ -115,6 +229,25 @@ df = pl.read_csv("path/to/cases.csv")
115
229
  casebase = cbrkit.loaders.polars(df)
116
230
  ```
117
231
 
232
+ For database access, CBRkit provides `sqlite` and `sqlalchemy` loaders (the latter requires the `sql` extra):
233
+
234
+ ```python
235
+ casebase = cbrkit.loaders.sqlite("path/to/database.db", "SELECT * FROM cases")
236
+ ```
237
+
238
+ **Tip:** You can validate a loaded casebase against a Pydantic model using `cbrkit.loaders.validate()`:
239
+
240
+ ```python
241
+ from pydantic import BaseModel
242
+
243
+ class Car(BaseModel):
244
+     price: int
245
+     year: int
246
+     model: str
247
+
248
+ casebase = cbrkit.loaders.validate(casebase, Car)
249
+ ```
250
+
118
251
  ## Defining Queries
119
252
 
120
253
  CBRkit expects the type of the queries to match the type of the cases.
@@ -139,6 +272,29 @@ In case your query collection only contains a single entry, you can use the `sin
139
272
  query = cbrkit.helpers.singleton(queries)
140
273
  ```
141
274
 
275
+ ## Exporting Data
276
+
277
+ CBRkit provides functions for exporting data through the `cbrkit.dumpers` module.
278
+ Supported formats include JSON, YAML, CSV, TOML, and Markdown.
279
+
280
+ ```python
281
+ import cbrkit
282
+
283
+ # Export to a file (format is inferred from the extension)
284
+ cbrkit.dumpers.file("output.json", data)
285
+ cbrkit.dumpers.file("output.yaml", data)
286
+
287
+ # Export to a directory (one file per entry)
288
+ cbrkit.dumpers.directory("output/", data)
289
+
290
+ # Or use the unified path function
291
+ cbrkit.dumpers.path("output.json", data) # writes a single file
292
+ cbrkit.dumpers.path("output/", data) # writes a directory
293
+
294
+ # Format data as a Markdown code block
295
+ md = cbrkit.dumpers.markdown()(data)
296
+ ```
297
+
142
298
  ## Similarity Measures and Aggregation
143
299
 
144
300
  The next step is to define similarity measures for the cases and queries.
@@ -229,6 +385,21 @@ cached_sim = cbrkit.sim.embed.build(
229
385
  )
230
386
  ```
231
387
 
388
+ #### Collection and Sequence Similarity
389
+
390
+ CBRkit provides similarity measures for collections and sequences in `cbrkit.sim.collections`:
391
+
392
+ ```python
393
+ # Jaccard similarity for sets (requires the `nltk` extra)
394
+ jaccard_sim = cbrkit.sim.collections.jaccard()
395
+
396
+ # Optimal sequence mapping using A* search
397
+ seq_sim = cbrkit.sim.collections.mapping(cbrkit.sim.generic.equality())
398
+ ```
399
+
400
+ Dynamic Time Warping and Smith-Waterman alignment are available with the `timeseries` extra.
401
+ See the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim/collections.html) for the full list.
402
+
232
403
  #### Taxonomy-Based Similarity
233
404
 
234
405
  ```python
@@ -269,20 +440,15 @@ CBRkit provides extensive support for graph similarity through various algorithm
269
440
 
270
441
  ```python
271
442
  # Using Graph Edit Distance (GED) with A* search
272
- graph_sim = cbrkit.sim.graphs.astar(
273
- node_sim=cbrkit.sim.generic.equality(),
443
+ graph_sim = cbrkit.sim.graphs.astar.build(
444
+ node_sim_func=cbrkit.sim.generic.equality(),
274
445
  node_matcher=lambda n1, n2: n1 == n2,
275
- edge_matcher=lambda e1, e2: e1 == e2
446
+ edge_matcher=lambda e1, e2: e1 == e2,
276
447
  )
277
448
  ```
278
449
 
279
- Available graph algorithms include:
280
- - `astar`: A* search for optimal graph edit distance
281
- - `vf2`: VF2 algorithm for (sub)graph isomorphism
282
- - `lap`: Linear assignment problem solver
283
- - `greedy`: Fast greedy matching
284
- - `brute_force`: Exhaustive search for small graphs
285
- - `dfs`: Depth-first search based matching
450
+ Available graph algorithms include `astar`, `vf2`, `greedy`, `lap`, `brute_force`, `dfs`, `dtw`, and `smith_waterman`.
451
+ See the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim/graphs.html) for a full list of algorithms and their parameters.
286
452
 
287
453
  ### Global Similarity and Aggregation
288
454
 
@@ -333,9 +499,30 @@ cbrkit.sim.attribute_value(
333
499
  )
334
500
  ```
335
501
 
502
+ ## CBR Cycle Phases
503
+
504
+ All four phases of the CBR cycle — retrieval, reuse, revise, and retain — follow the same unified protocol `CbrFunc` (defined in `cbrkit.typing`).
505
+ Each phase function takes a casebase and a query, and returns an updated casebase together with a score map.
506
+ The casebase in the output may differ from the input depending on the phase (e.g., adapted cases in reuse, newly stored cases in retain).
507
+ The score map assigns a floating-point score to each case in the output casebase, with phase-specific semantics:
508
+
509
+ - **Retrieval**: Similarity scores between cases and the query.
510
+ - **Reuse**: Quality scores of adapted cases compared to the query.
511
+ - **Revise**: Assessment scores evaluating solution correctness.
512
+ - **Retain**: Fitness scores for retained cases.
513
+
514
+ This uniform interface makes it easy to compose phases into pipelines and to swap implementations.
515
+ The phase-specific type aliases `RetrieverFunc`, `ReuserFunc`, `ReviserFunc`, and `RetainerFunc` are provided for clarity but are structurally identical to `CbrFunc`.
516
+
517
+ Each phase result has the following attributes:
518
+
519
+ - `similarities`: A dictionary containing the scores for each case.
520
+ - `ranking`: A list of case keys sorted by their score.
521
+ - `casebase`: The casebase containing the output cases.
522
+
336
523
  ## Retrieval
337
524
 
338
- The final step is to retrieve cases based on the loaded queries.
525
+ The first phase is to retrieve cases based on the loaded queries.
339
526
  The `cbrkit.retrieval` module provides utility functions for this purpose.
340
527
  You first build a retrieval pipeline by specifying a global similarity function and optionally a limit for the number of retrieved cases.
341
528
 
@@ -439,50 +626,177 @@ An overview of all available adaptation functions can be found in the [module do
439
626
 
440
627
  ## Reuse
441
628
 
442
- The reuse phase applies adaptation functions to retrieved cases. The `cbrkit.reuse` module provides utility functions for this purpose. You first build a reuse pipeline by specifying a global adaptation function:
629
+ The reuse phase applies adaptation functions to retrieved cases and scores the adapted results.
630
+ The `cbrkit.reuse` module provides utility functions for this purpose.
631
+ You build a reuse pipeline by specifying an adaptation function and a similarity function:
443
632
 
444
633
  ```python
445
634
  reuser = cbrkit.reuse.build(
446
- cbrkit.adapt.attribute_value(...),
635
+ adaptation_func=cbrkit.adapt.attribute_value(...),
636
+ similarity_func=cbrkit.sim.attribute_value(...),
447
637
  )
448
638
  ```
449
639
 
450
- This reuser can then be applied to the retrieval result to adapt cases based on a query:
640
+ This reuser can then be applied to a retrieval result to adapt cases based on a query:
451
641
 
452
642
  ```python
453
- result = cbrkit.reuse.apply_query(retrieval_result, query, reuser)
643
+ result = cbrkit.reuse.apply_result(retrieval_result, reuser)
454
644
  ```
455
645
 
456
- Our result has the following attributes:
457
-
458
- - `adaptations`: A dictionary containing the adapted values for each case.
459
- - `ranking`: A list of case indices matching the retrieval result.
460
- - `casebase`: The casebase containing only the adapted cases.
646
+ As with all CBR phases, the result contains `similarities` (quality scores of adapted cases), `ranking`, and `casebase` (containing the adapted cases).
461
647
 
462
648
  Multiple reuse pipelines can be combined by passing them as a list or tuple:
463
649
 
464
650
  ```python
465
651
  reuser1 = cbrkit.reuse.build(...)
466
652
  reuser2 = cbrkit.reuse.build(...)
467
- result = cbrkit.reuse.apply_query(retrieval_result, query, (reuser1, reuser2))
653
+ result = cbrkit.reuse.apply_result(retrieval_result, (reuser1, reuser2))
468
654
  ```
469
655
 
470
656
  The result structure follows the same pattern as the retrieval results with `final_step` and `steps` attributes.
471
657
 
658
+ ## Revise
659
+
660
+ The revise phase assesses the quality of solutions produced by the reuse phase and optionally repairs them.
661
+ The `cbrkit.revise` module provides utility functions for this purpose.
662
+ You build a revise pipeline by specifying an assessment function and an optional repair function:
663
+
664
+ ```python
665
+ reviser = cbrkit.revise.build(
666
+ assess_func=cbrkit.sim.attribute_value(...),
667
+ repair_func=some_adaptation_func, # optional
668
+ )
669
+ ```
670
+
671
+ The reviser can be applied to a reuse result:
672
+
673
+ ```python
674
+ result = cbrkit.revise.apply_result(reuse_result, reviser)
675
+ ```
676
+
677
+ When a `repair_func` is provided, solutions are repaired before assessment.
678
+ The result contains `similarities` with quality assessment scores for each case.
679
+
680
+ ## Retain
681
+
682
+ The retain phase decides whether and how to integrate new cases into the casebase.
683
+ The `cbrkit.retain` module provides utility functions for this purpose.
684
+ You build a retain pipeline by specifying an assessment function and a storage function:
685
+
686
+ ```python
687
+ retainer = cbrkit.retain.build(
688
+ assess_func=cbrkit.sim.generic.equality(),
689
+ storage_func=cbrkit.retain.static(
690
+ key_func=lambda keys: max(keys, default=-1) + 1,
691
+ casebase=casebase,
692
+ ),
693
+ )
694
+ ```
695
+
696
+ CBRkit provides several built-in storage functions:
697
+
698
+ - `static`: Generates keys from a fixed reference casebase to avoid collisions.
699
+ - `indexable`: Keeps an `IndexableFunc`'s index in sync with the casebase.
700
+
701
+ You can filter retained cases based on their assessment scores using the `dropout` wrapper:
702
+
703
+ ```python
704
+ retainer = cbrkit.retain.dropout(
705
+ retainer_func=cbrkit.retain.build(...),
706
+ min_similarity=0.5,
707
+ )
708
+ ```
709
+
710
+ The retainer can be applied to a revise result:
711
+
712
+ ```python
713
+ result = cbrkit.retain.apply_result(revise_result, retainer)
714
+ ```
715
+
716
+ The result contains `similarities` with fitness scores and `casebase` with the updated cases.
717
+
718
+ ## Full CBR Cycle
719
+
720
+ The `cbrkit.cycle` module orchestrates all four phases (retrieval, reuse, revise, retain) in a single call.
721
+ This is useful when you want to run the complete CBR cycle without manually chaining the phases.
722
+
723
+ ```python
724
+ result = cbrkit.cycle.apply_query(
725
+ casebase,
726
+ query,
727
+ retrievers=retriever,
728
+ reusers=reuser,
729
+ revisers=reviser,
730
+ retainers=retainer,
731
+ )
732
+ # Access results from each phase
733
+ retrieval_result = result.retrieval
734
+ reuse_result = result.reuse
735
+ revise_result = result.revise
736
+ retain_result = result.retain
737
+ ```
738
+
739
+ For multiple queries, use `cbrkit.cycle.apply_queries` or `cbrkit.cycle.apply_batches`.
740
+
741
+ ## System
742
+
743
+ The `cbrkit.system.System` class provides a high-level interface for composing all CBR phases into a single reusable object.
744
+ It is especially useful for integrating CBRkit into applications where the casebase and phase functions are configured once and reused across multiple queries.
745
+
746
+ ```python
747
+ system = cbrkit.system.System(
748
+ casebase=casebase,
749
+ retriever_factory=lambda config: retriever,
750
+ reuser_factory=lambda config: reuser,
751
+ )
752
+
753
+ # Run individual phases
754
+ retrieval_result = system.retrieve(query)
755
+ reuse_result = system.reuse(query)
756
+
757
+ # Run the full cycle
758
+ cycle_result = system.cycle(query)
759
+ ```
760
+
761
+ The `System` class supports optional configuration parameters for each phase factory, allowing you to customize the behavior per query.
762
+
472
763
  ## Advanced Retrieval
473
764
 
474
765
  ### BM25 Retrieval
475
766
 
476
- CBRkit includes a BM25 retriever for text-based retrieval:
767
+ CBRkit includes a BM25 retriever for sparse text-based retrieval (requires the `bm25` extra).
768
+ The BM25 retriever delegates text tokenization to a `cbrkit.sim.embed.bm25` embedding function:
477
769
 
478
770
  ```python
479
- retriever = cbrkit.retrieval.bm25(
480
- key="text_field", # Field to search in
481
- limit=10
771
+ bm25_func = cbrkit.sim.embed.bm25(language="en")
772
+ retriever = cbrkit.retrieval.dropout(
773
+ cbrkit.retrieval.bm25(conversion_func=bm25_func),
774
+ limit=10,
775
+ )
776
+ result = cbrkit.retrieval.apply_query(casebase, query, retriever)
777
+ ```
778
+
779
+ ### Embedding-Based Retrieval
780
+
781
+ CBRkit supports embedding-based retrieval through vector similarity search.
782
+ The `embed` retriever uses an embedding function with caching and a vector scorer:
783
+
784
+ ```python
785
+ embed_func = cbrkit.sim.embed.cache(
786
+ func=cbrkit.sim.embed.sentence_transformers(model="all-MiniLM-L6-v2"),
787
+ path="embeddings.sqlite3",
788
+ table="strf/minilm",
789
+ )
790
+ retriever = cbrkit.retrieval.dropout(
791
+ cbrkit.retrieval.embed(conversion_func=embed_func),
792
+ limit=10,
482
793
  )
483
794
  result = cbrkit.retrieval.apply_query(casebase, query, retriever)
484
795
  ```
485
796
 
797
+ For persistent storage backends, CBRkit also supports `lancedb`, `chromadb`, and `zvec` retrievers (each requires its respective extra).
798
+ These backends manage index persistence and support hybrid search modes.
799
+
486
800
  ### Combining Multiple Retrievers
487
801
 
488
802
  The `combine` function allows merging results from multiple retrievers:
@@ -492,23 +806,78 @@ retriever1 = cbrkit.retrieval.build(...)
492
806
  retriever2 = cbrkit.retrieval.bm25(...)
493
807
 
494
808
  combined = cbrkit.retrieval.combine(
495
- retrievers=[retriever1, retriever2],
496
- aggregator=cbrkit.sim.aggregator(pooling="mean")
809
+ retriever_funcs=[retriever1, retriever2],
810
+ aggregator=cbrkit.sim.aggregator(pooling="mean"),
497
811
  )
498
812
  result = cbrkit.retrieval.apply_query(casebase, query, combined)
499
813
  ```
500
814
 
501
815
  ### Distributed Processing
502
816
 
503
- For large-scale retrieval, use the `distribute` wrapper:
817
+ `build` and `distribute` offer two different levels of parallelism.
818
+ `build(sim_func, multiprocessing=True)` parallelizes the **similarity computations** within batches: all (casebase, query) pairs are flattened into individual comparisons and distributed across processes.
819
+ `distribute(retriever, multiprocessing=True)` parallelizes across **batches**: each (casebase, query) pair is passed to the wrapped retriever in a separate process.
820
+ Use `distribute` when you have many independent queries and want to process them in parallel as separate retrieval tasks:
504
821
 
505
822
  ```python
506
823
  retriever = cbrkit.retrieval.distribute(
507
824
  cbrkit.retrieval.build(...),
508
- batch_size=1000
825
+ multiprocessing=True, # or an integer for a specific number of processes
509
826
  )
510
827
  ```
511
828
 
829
+ ### Re-ranking
830
+
831
+ CBRkit supports re-ranking retrieved results using external models.
832
+ Re-rankers take the initial retrieval results and reorder them based on a more expensive model.
833
+ The following re-rankers are available (each requires its respective extra):
834
+
835
+ - `cbrkit.retrieval.cohere`: Cohere re-ranking (extra: `cohere`)
836
+ - `cbrkit.retrieval.voyageai`: Voyage AI re-ranking (extra: `voyageai`)
837
+ - `cbrkit.retrieval.sentence_transformers`: Sentence Transformers cross-encoder re-ranking (extra: `transformers`)
838
+
839
+ ```python
840
+ reranker = cbrkit.retrieval.cohere(model="rerank-v3.5")
841
+
842
+ # Use as a second-stage retriever in a sequential pipeline
843
+ retriever = cbrkit.retrieval.build(cbrkit.sim.attribute_value(...))
844
+ result = cbrkit.retrieval.apply_query(casebase, query, (retriever, reranker))
845
+ ```
846
+
847
+ ### Indexed Retrieval
848
+
849
+ Some retrievers like `bm25`, `embed`, and `lancedb` support **indexed retrieval**, where the casebase is pre-indexed once and then queried without passing the full casebase each time.
850
+ This is useful for large casebases or when using external search backends.
851
+
852
+ To use indexed retrieval, first create a retriever and call its `create_index()` method:
853
+
854
+ ```python
855
+ from frozendict import frozendict
856
+
857
+ bm25_func = cbrkit.sim.embed.bm25(language="en")
858
+ retriever = cbrkit.retrieval.bm25(conversion_func=bm25_func)
859
+ retriever.create_index(frozendict(casebase))
860
+ ```
861
+
862
+ Then pass an empty casebase (`{}`) to signal that the retriever should use its pre-indexed data:
863
+
864
+ ```python
865
+ result = cbrkit.retrieval.apply_query({}, query, retriever)
866
+ ```
867
+
868
+ As a convenience, CBRkit provides `apply_query_indexed` and `apply_queries_indexed` which handle the empty casebase automatically:
869
+
870
+ ```python
871
+ result = cbrkit.retrieval.apply_query_indexed(query, retriever)
872
+ # or for multiple queries:
873
+ result = cbrkit.retrieval.apply_queries_indexed(queries, retriever)
874
+ ```
875
+
876
+ If a retriever receives an empty casebase but has not been indexed yet, a `ValueError` is raised, instructing you to build the index first (via `create_index()`, as shown above).
877
+
878
+ The `System` class also supports indexed retrieval by defaulting the casebase to an empty dict.
879
+ This allows creating a system where all retrievers are pre-indexed and no casebase needs to be provided at query time.
880
+
512
881
  ## Evaluation
513
882
 
514
883
  CBRkit provides evaluation tools through the `cbrkit.eval` module for assessing the quality of retrieval results.
@@ -559,20 +928,26 @@ All of them can be computed at different cutoff points by appending `@k`, e.g.,
559
928
  We also offer a function to automatically generate a list of metrics for different cutoff points:
560
929
 
561
930
  ```python
562
- metrics = cbrkit.eval.metrics_at_k(["precision", "recall", "f1"], [1, 5, 10])
931
+ metrics = cbrkit.eval.generate_metrics(["precision", "recall", "f1"], ks=[1, 5, 10])
563
932
  ```
564
933
 
565
934
  ## Synthesis
566
935
 
567
936
  In the context of CBRkit, synthesis refers to creating new insights from the cases which were retrieved in a previous retrieval step, for example in a RAG context. CBRkit builds a synthesizer using the function `cbrkit.synthesis.build` with a `provider` and a `prompt`. A synthesizer maps a `Result` (obtained in the retrieval step) to an LLM output (either a string or structured output). An example can be found in [examples/cars_rag.py](https://github.com/wi2trier/cbrkit/blob/main/examples/cars_rag.py).
568
937
 
569
- The following **providers** are currently supported if a valid API key is stored the respective environment variable:
938
+ The following **providers** are available in `cbrkit.synthesis.providers` (each requires its respective extra):
570
939
 
571
- - Anthropic (`ANTHROPIC_API_KEY`)
572
- - Cohere (`CO_API_KEY`)
573
- - Google (`GOOGLE_API_KEY`)
574
- - Ollama
575
- - OpenAI (`OPENAI_API_KEY`)
940
+ - `openai` / `openai_completions`: OpenAI Completions API (`OPENAI_API_KEY`)
941
+ - `openai_responses`: OpenAI Responses API (`OPENAI_API_KEY`)
942
+ - `openai_agents`: OpenAI Agents framework (`OPENAI_API_KEY`)
943
+ - `anthropic`: Anthropic Claude API (`ANTHROPIC_API_KEY`)
944
+ - `cohere`: Cohere API (`CO_API_KEY`)
945
+ - `google`: Google Generative AI (`GOOGLE_API_KEY`)
946
+ - `ollama`: Ollama (local, no API key needed)
947
+ - `pydantic_ai`: Pydantic AI framework
948
+ - `instructor`: Instructor for structured output
949
+
950
+ Providers can be chained using `cbrkit.synthesis.providers.pipe()` and managed as conversations using `cbrkit.synthesis.providers.conversation()`.
576
951
 
577
952
  The respective provider class in `cbrkit.synthesis.providers` has to be initialized with the model name and a response type (either `str` or a [Pydantic model](https://docs.pydantic.dev/latest/concepts/models/) for structured output). Further model options like `temperature`, `seed`, `max_tokens`, etc. can also be specified here.
578
953
 
@@ -603,16 +978,15 @@ CBRKit's `transpose` prompt allows to transpose cases and queries before they ar
603
978
 
604
979
  ```python
605
980
  from cbrkit.typing import JsonEntry
606
- from cbrkit.dumpers import json_markdown
607
981
 
608
982
  def encoder(value) -> dict:
609
983
  ...
610
984
  baseprompt = cbrkit.synthesis.prompts.default(instructions, encoder=encoder)
611
985
  # transform the entries, e.g., by shortening, leaving out irrelevant attributes, etc.
612
- # In this case, the value of every field is trunctated to 100 characters
986
+ # In this case, the value of every field is truncated to 100 characters
613
987
  def shorten(entry: dict) -> JsonEntry:
614
- entry = {k: str(v)[:100] for k,v in entry.items()}
615
- return json_markdown(entry)
988
+ entry = {k: str(v)[:100] for k, v in entry.items()}
989
+ return cbrkit.dumpers.markdown()(entry)
616
990
 
617
991
  prompt = cbrkit.synthesis.prompts.transpose(baseprompt, shorten)
618
992
  synthesizer = cbrkit.synthesis.build(provider, prompt)
@@ -650,6 +1024,60 @@ response = get_result(batches)
650
1024
 
651
1025
  The complete version of this example can be found under `examples/cars_rag_large.py`.
652
1026
 
1027
+ ## Tips and Common Patterns
1028
+
1029
+ ### Parameter Naming Conventions
1030
+
1031
+ CBRkit inspects function signatures to determine their behavior:
1032
+
1033
+ - **Similarity functions** must use `x` (case) and `y` (query) as parameter names.
1034
+ - **Adaptation functions** must use `case` and `query` for pair functions, or `casebase` and `query` for map/reduce functions.
1035
+ - **Batch functions** accept a list of tuples instead of individual pairs: `f([(x1, y1), (x2, y2), ...])`.
1036
+
1037
+ ### Filtering with `dropout`
1038
+
1039
+ The `dropout` wrapper is the standard way to add limits and thresholds to any retriever or retainer.
1040
+ It supports `limit` (maximum number of results), `min_similarity`, and `max_similarity`:
1041
+
1042
+ ```python
1043
+ retriever = cbrkit.retrieval.dropout(
1044
+ cbrkit.retrieval.build(sim_func),
1045
+ limit=10,
1046
+ min_similarity=0.3,
1047
+ )
1048
+ ```
1049
+
1050
+ ### Composing Multiple Phase Functions
1051
+
1052
+ All CBR phases support sequential composition by passing a tuple of phase functions.
1053
+ Each step receives the output casebase of the previous step, enabling patterns like MAC/FAC:
1054
+
1055
+ ```python
1056
+ result = cbrkit.retrieval.apply_query(casebase, query, (cheap_retriever, expensive_retriever))
1057
+ ```
1058
+
1059
+ ### Using `frozendict` for Immutable Casebases
1060
+
1061
+ Several components (e.g., indexed retrieval, retain phase) benefit from immutable casebases.
1062
+ Use `frozendict` to prevent accidental mutations:
1063
+
1064
+ ```python
1065
+ from frozendict import frozendict
1066
+ casebase = frozendict(cbrkit.loaders.file("cases.json"))
1067
+ ```
1068
+
1069
+ ### Multiprocessing Support
1070
+
1071
+ The `cbrkit.retrieval.build` function supports multiprocessing to parallelize similarity computations within batches:
1072
+
1073
+ ```python
1074
+ retriever = cbrkit.retrieval.build(sim_func, multiprocessing=True)
1075
+ # or with a specific number of processes:
1076
+ retriever = cbrkit.retrieval.build(sim_func, multiprocessing=4)
1077
+ ```
1078
+
1079
+ To parallelize across batches instead, see [Distributed Processing](#distributed-processing).
1080
+
653
1081
  ## Logging
654
1082
 
655
1083
  CBRkit integrates with the `logging` module to provide a unified logging interface.