cbrkit 1.0.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbrkit-1.0.0/README.md → cbrkit-1.2.0/PKG-INFO +491 -63
- cbrkit-1.0.0/PKG-INFO → cbrkit-1.2.0/README.md +390 -148
- {cbrkit-1.0.0 → cbrkit-1.2.0}/pyproject.toml +45 -38
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/__init__.py +8 -2
- cbrkit-1.2.0/src/cbrkit/adapt/__init__.py +39 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/attribute_value.py +13 -20
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/generic.py +7 -4
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/numbers.py +4 -4
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/adapt/strings.py +1 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/api.py +45 -9
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/cli.py +57 -25
- cbrkit-1.2.0/src/cbrkit/cycle.py +128 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/dumpers.py +11 -8
- cbrkit-1.2.0/src/cbrkit/eval/__init__.py +61 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/eval/common.py +113 -10
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/eval/retrieval.py +4 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/helpers.py +162 -56
- cbrkit-1.2.0/src/cbrkit/indexable.py +717 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/loaders.py +124 -18
- cbrkit-1.2.0/src/cbrkit/model/__init__.py +48 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/model/graph.py +48 -19
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/model/result.py +50 -28
- cbrkit-1.2.0/src/cbrkit/retain/__init__.py +68 -0
- cbrkit-1.2.0/src/cbrkit/retain/apply.py +155 -0
- cbrkit-1.2.0/src/cbrkit/retain/build.py +151 -0
- cbrkit-1.2.0/src/cbrkit/retain/storage.py +135 -0
- cbrkit-1.2.0/src/cbrkit/retrieval/__init__.py +112 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/retrieval/apply.py +53 -14
- cbrkit-1.2.0/src/cbrkit/retrieval/build.py +108 -0
- cbrkit-1.2.0/src/cbrkit/retrieval/indexable.py +1050 -0
- cbrkit-1.2.0/src/cbrkit/retrieval/rerank.py +219 -0
- cbrkit-1.2.0/src/cbrkit/retrieval/wrappers.py +492 -0
- cbrkit-1.2.0/src/cbrkit/reuse/__init__.py +52 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/reuse/apply.py +11 -4
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/reuse/build.py +59 -17
- cbrkit-1.2.0/src/cbrkit/revise/__init__.py +50 -0
- cbrkit-1.2.0/src/cbrkit/revise/apply.py +155 -0
- cbrkit-1.2.0/src/cbrkit/revise/build.py +141 -0
- cbrkit-1.2.0/src/cbrkit/sim/__init__.py +78 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/aggregator.py +13 -10
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/attribute_value.py +24 -9
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/collections.py +8 -2
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/embed.py +435 -73
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/generic.py +7 -5
- cbrkit-1.2.0/src/cbrkit/sim/graphs/__init__.py +88 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/alignment.py +10 -7
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/astar.py +23 -4
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/brute_force.py +1 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/common.py +62 -14
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/dfs.py +6 -2
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/lap.py +7 -1
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/vf2.py +12 -4
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/numbers.py +4 -4
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/strings.py +4 -2
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/taxonomy.py +17 -6
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/wrappers.py +55 -21
- cbrkit-1.2.0/src/cbrkit/synthesis/__init__.py +60 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/apply.py +4 -2
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/build.py +18 -2
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/model.py +13 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/prompts.py +19 -9
- cbrkit-1.2.0/src/cbrkit/synthesis/providers/__init__.py +80 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/anthropic.py +5 -2
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/cohere.py +4 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/google.py +2 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/instructor.py +2 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/model.py +9 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/ollama.py +2 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/openai_agents.py +9 -7
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/openai_completions.py +4 -1
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/openai_responses.py +5 -3
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/pydantic_ai.py +9 -13
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/synthesis/providers/wrappers.py +4 -0
- cbrkit-1.2.0/src/cbrkit/system.py +199 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/typing.py +98 -21
- cbrkit-1.0.0/src/cbrkit/adapt/__init__.py +0 -18
- cbrkit-1.0.0/src/cbrkit/cycle.py +0 -60
- cbrkit-1.0.0/src/cbrkit/eval/__init__.py +0 -31
- cbrkit-1.0.0/src/cbrkit/model/__init__.py +0 -11
- cbrkit-1.0.0/src/cbrkit/retrieval/__init__.py +0 -39
- cbrkit-1.0.0/src/cbrkit/retrieval/build.py +0 -362
- cbrkit-1.0.0/src/cbrkit/retrieval/rerank.py +0 -347
- cbrkit-1.0.0/src/cbrkit/reuse/__init__.py +0 -21
- cbrkit-1.0.0/src/cbrkit/sim/__init__.py +0 -48
- cbrkit-1.0.0/src/cbrkit/sim/graphs/__init__.py +0 -50
- cbrkit-1.0.0/src/cbrkit/sim/graphs/precompute.py +0 -80
- cbrkit-1.0.0/src/cbrkit/synthesis/__init__.py +0 -27
- cbrkit-1.0.0/src/cbrkit/synthesis/providers/__init__.py +0 -43
- cbrkit-1.0.0/src/cbrkit/system.py +0 -100
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/__main__.py +0 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/constants.py +0 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/py.typed +0 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/greedy.py +0 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/graphs/qap.py +0 -0
- {cbrkit-1.0.0 → cbrkit-1.2.0}/src/cbrkit/sim/pooling.py +0 -0
|
@@ -1,3 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: cbrkit
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
|
|
5
|
+
Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
|
|
6
|
+
Author: Mirko Lenz
|
|
7
|
+
Author-email: Mirko Lenz <mirko@mirkolenz.com>
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Framework :: Pytest
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Natural Language :: English
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Dist: frozendict>=2,<3
|
|
25
|
+
Requires-Dist: numpy>=2,<3
|
|
26
|
+
Requires-Dist: orjson>=3,<4
|
|
27
|
+
Requires-Dist: polars>=1,<2
|
|
28
|
+
Requires-Dist: pydantic>=2,<3
|
|
29
|
+
Requires-Dist: pyyaml>=6,<7
|
|
30
|
+
Requires-Dist: rtoml>=0.12,<1
|
|
31
|
+
Requires-Dist: scipy>=1,<2
|
|
32
|
+
Requires-Dist: xmltodict>=1,<2
|
|
33
|
+
Requires-Dist: cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pydantic-ai,spacy,sql,timeseries,transformers,voyageai,zvec] ; extra == 'all'
|
|
34
|
+
Requires-Dist: anthropic>=0.40,<1 ; extra == 'anthropic'
|
|
35
|
+
Requires-Dist: cbrkit[cli] ; extra == 'api'
|
|
36
|
+
Requires-Dist: fastapi>=0.100,<1 ; extra == 'api'
|
|
37
|
+
Requires-Dist: pydantic-settings>=2,<3 ; extra == 'api'
|
|
38
|
+
Requires-Dist: python-multipart>=0.0.15,<1 ; extra == 'api'
|
|
39
|
+
Requires-Dist: uvicorn[standard]>=0.30,<1 ; extra == 'api'
|
|
40
|
+
Requires-Dist: fastmcp>=3,<4 ; extra == 'api'
|
|
41
|
+
Requires-Dist: bm25s[core,stem,indexing]>=0.3,<1 ; extra == 'bm25'
|
|
42
|
+
Requires-Dist: chromadb>=1,<2 ; extra == 'chromadb'
|
|
43
|
+
Requires-Dist: chonkie>=1,<2 ; extra == 'chunking'
|
|
44
|
+
Requires-Dist: rich>=13,<15 ; extra == 'cli'
|
|
45
|
+
Requires-Dist: typer>=0.9,<1 ; extra == 'cli'
|
|
46
|
+
Requires-Dist: cohere>=5,<6 ; extra == 'cohere'
|
|
47
|
+
Requires-Dist: ranx>=0.3,<1 ; extra == 'eval'
|
|
48
|
+
Requires-Dist: google-genai>=1,<2 ; extra == 'google'
|
|
49
|
+
Requires-Dist: networkx>=3,<4 ; extra == 'graphs'
|
|
50
|
+
Requires-Dist: rustworkx>=0.15,<1 ; extra == 'graphs'
|
|
51
|
+
Requires-Dist: pygraphviz>=1,<2 ; extra == 'graphviz'
|
|
52
|
+
Requires-Dist: instructor>=1,<2 ; extra == 'instructor'
|
|
53
|
+
Requires-Dist: lancedb>=0.20,<1 ; extra == 'lancedb'
|
|
54
|
+
Requires-Dist: levenshtein>=0.26,<1 ; extra == 'levenshtein'
|
|
55
|
+
Requires-Dist: nltk>=3,<4 ; extra == 'nltk'
|
|
56
|
+
Requires-Dist: ollama>=0.3,<1 ; extra == 'ollama'
|
|
57
|
+
Requires-Dist: openai>=1,<3 ; extra == 'openai'
|
|
58
|
+
Requires-Dist: tiktoken>=0.8,<1 ; extra == 'openai'
|
|
59
|
+
Requires-Dist: openai-agents>=0.2,<1 ; extra == 'openai-agents'
|
|
60
|
+
Requires-Dist: pandas>=2,<4 ; extra == 'pandas'
|
|
61
|
+
Requires-Dist: pydantic-ai-slim>=1,<2 ; extra == 'pydantic-ai'
|
|
62
|
+
Requires-Dist: spacy>=3.8,<4 ; extra == 'spacy'
|
|
63
|
+
Requires-Dist: sqlalchemy>=2,<3 ; extra == 'sql'
|
|
64
|
+
Requires-Dist: minineedle>=3,<4 ; extra == 'timeseries'
|
|
65
|
+
Requires-Dist: sentence-transformers>=4,<6 ; extra == 'transformers'
|
|
66
|
+
Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
|
|
67
|
+
Requires-Dist: transformers>=4,<6 ; extra == 'transformers'
|
|
68
|
+
Requires-Dist: voyageai>=0.3,<1 ; extra == 'voyageai'
|
|
69
|
+
Requires-Python: >=3.13, <4
|
|
70
|
+
Project-URL: Repository, https://github.com/wi2trier/cbrkit
|
|
71
|
+
Project-URL: Documentation, https://wi2trier.github.io/cbrkit/
|
|
72
|
+
Project-URL: Issues, https://github.com/wi2trier/cbrkit/issues
|
|
73
|
+
Project-URL: Changelog, https://github.com/wi2trier/cbrkit/releases
|
|
74
|
+
Provides-Extra: all
|
|
75
|
+
Provides-Extra: anthropic
|
|
76
|
+
Provides-Extra: api
|
|
77
|
+
Provides-Extra: bm25
|
|
78
|
+
Provides-Extra: chromadb
|
|
79
|
+
Provides-Extra: chunking
|
|
80
|
+
Provides-Extra: cli
|
|
81
|
+
Provides-Extra: cohere
|
|
82
|
+
Provides-Extra: eval
|
|
83
|
+
Provides-Extra: google
|
|
84
|
+
Provides-Extra: graphs
|
|
85
|
+
Provides-Extra: graphviz
|
|
86
|
+
Provides-Extra: instructor
|
|
87
|
+
Provides-Extra: lancedb
|
|
88
|
+
Provides-Extra: levenshtein
|
|
89
|
+
Provides-Extra: nltk
|
|
90
|
+
Provides-Extra: ollama
|
|
91
|
+
Provides-Extra: openai
|
|
92
|
+
Provides-Extra: openai-agents
|
|
93
|
+
Provides-Extra: pandas
|
|
94
|
+
Provides-Extra: pydantic-ai
|
|
95
|
+
Provides-Extra: spacy
|
|
96
|
+
Provides-Extra: sql
|
|
97
|
+
Provides-Extra: timeseries
|
|
98
|
+
Provides-Extra: transformers
|
|
99
|
+
Provides-Extra: voyageai
|
|
100
|
+
Description-Content-Type: text/markdown
|
|
101
|
+
|
|
1
102
|
<!-- markdownlint-disable MD033 MD041 -->
|
|
2
103
|
<h1><p align="center">CBRkit</p></h1>
|
|
3
104
|
|
|
@@ -36,26 +137,30 @@ To get started, we provide a [demo project](https://github.com/wi2trier/cbrkit-d
|
|
|
36
137
|
Further examples can be found in our [tests](./tests/test_retrieve.py) and [documentation](https://wi2trier.github.io/cbrkit/).
|
|
37
138
|
The following modules are part of CBRkit:
|
|
38
139
|
|
|
39
|
-
- `cbrkit.loaders
|
|
40
|
-
- `cbrkit.
|
|
41
|
-
|
|
140
|
+
- `cbrkit.loaders`: Functions for loading cases and queries from various file formats and data sources.
|
|
141
|
+
- `cbrkit.dumpers`: Functions for exporting data to JSON, YAML, CSV, TOML, and Markdown.
|
|
142
|
+
- `cbrkit.sim`: Similarity measures for common data types with utility functions such as `cache`, `combine`, `transpose`, etc.
|
|
143
|
+
- `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, spaCy, etc.).
|
|
42
144
|
- `cbrkit.sim.numbers`: Numeric similarity measures (linear, exponential, threshold).
|
|
43
|
-
- `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard,
|
|
145
|
+
- `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, etc.).
|
|
44
146
|
- `cbrkit.sim.embed`: Embedding-based similarity functions with caching support.
|
|
45
|
-
- `cbrkit.sim.graphs`: Graph similarity algorithms including
|
|
46
|
-
- `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions.
|
|
147
|
+
- `cbrkit.sim.graphs`: Graph similarity algorithms including A\*, VF2, greedy, LAP, and more.
|
|
148
|
+
- `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions (Wu-Palmer, etc.).
|
|
47
149
|
- `cbrkit.sim.generic`: Generic similarity functions (equality, tables, static).
|
|
48
150
|
- `cbrkit.sim.attribute_value`: Similarity for attribute-value based data.
|
|
49
151
|
- `cbrkit.sim.pooling`: Functions for aggregating multiple similarity values.
|
|
50
152
|
- `cbrkit.sim.aggregator`: Combines multiple local measures into global scores.
|
|
51
|
-
- `cbrkit.
|
|
52
|
-
- `cbrkit.
|
|
53
|
-
- `cbrkit.reuse`:
|
|
153
|
+
- `cbrkit.adapt`: Adaptation functions for adapting cases based on a query.
|
|
154
|
+
- `cbrkit.retrieval`: Retrieval pipelines with BM25, embedding-based retrieval, re-ranking (Cohere, Voyage AI, Sentence Transformers), and more.
|
|
155
|
+
- `cbrkit.reuse`: Reuse pipelines that apply adaptation and score the results.
|
|
156
|
+
- `cbrkit.revise`: Revision pipelines for assessing and optionally repairing solutions.
|
|
157
|
+
- `cbrkit.retain`: Retention pipelines for storing solved cases back into the casebase.
|
|
158
|
+
- `cbrkit.cycle`: Full CBR cycle orchestration across all four phases.
|
|
159
|
+
- `cbrkit.system`: High-level `System` class for composing all CBR phases into a single object.
|
|
160
|
+
- `cbrkit.synthesis`: LLM-based synthesis for generating insights from cases (RAG), with providers for OpenAI, Anthropic, Cohere, Google, Ollama, and more.
|
|
54
161
|
- `cbrkit.eval`: Evaluation metrics for retrieval results including precision, recall, and custom metrics.
|
|
55
|
-
- `cbrkit.model`: Data models for
|
|
56
|
-
- `cbrkit.cycle`: CBR cycle implementation.
|
|
162
|
+
- `cbrkit.model`: Data models for results and graph structures.
|
|
57
163
|
- `cbrkit.typing`: Generic type definitions for defining custom functions.
|
|
58
|
-
- `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g., in a RAG context.
|
|
59
164
|
|
|
60
165
|
## Installation
|
|
61
166
|
|
|
@@ -74,14 +179,12 @@ pip install cbrkit[EXTRA_NAME,...]
|
|
|
74
179
|
where `EXTRA_NAME` is one of the following:
|
|
75
180
|
|
|
76
181
|
- `all`: All optional dependencies
|
|
77
|
-
- `
|
|
78
|
-
- `
|
|
79
|
-
-
|
|
80
|
-
- `graphs
|
|
81
|
-
- `
|
|
82
|
-
- `
|
|
83
|
-
- `timeseries`: Time series similarity measures like `dtw` and `smith_waterman`
|
|
84
|
-
- `transformers`: Advanced NLP tools based on `pytorch` and `transformers`
|
|
182
|
+
- **LLM providers:** `anthropic`, `cohere`, `google`, `ollama`, `openai`, `openai-agents`, `pydantic-ai`, `instructor`, `voyageai`
|
|
183
|
+
- **NLP / text processing:** `bm25`, `chunking`, `levenshtein`, `nltk`, `spacy`
|
|
184
|
+
- **ML / embeddings:** `transformers` (includes `pytorch` and `sentence-transformers`)
|
|
185
|
+
- **Graphs:** `graphs` (`networkx` and `rustworkx`), `graphviz`
|
|
186
|
+
- **Data backends:** `chromadb`, `lancedb`, `pandas`, `sql` (SQLAlchemy), `zvec`
|
|
187
|
+
- **Tools:** `cli` (CLI), `api` (REST API server), `eval` (evaluation metrics), `timeseries` (DTW, Smith-Waterman)
|
|
85
188
|
|
|
86
189
|
Alternatively, you can also clone this git repository and install CBRkit and its dependencies via uv: `uv sync --all-extras`
|
|
87
190
|
|
|
@@ -95,7 +198,8 @@ We provide predefined functions for the following formats:
|
|
|
95
198
|
- toml
|
|
96
199
|
- xml
|
|
97
200
|
- yaml
|
|
98
|
-
-
|
|
201
|
+
- txt (plain text)
|
|
202
|
+
- py (object inside a Python file)
|
|
99
203
|
|
|
100
204
|
Loading one of those formats can be done via the `file` function:
|
|
101
205
|
|
|
@@ -104,8 +208,18 @@ import cbrkit
|
|
|
104
208
|
casebase = cbrkit.loaders.file("path/to/cases.[json,toml,yaml,xml,csv]")
|
|
105
209
|
```
|
|
106
210
|
|
|
107
|
-
|
|
108
|
-
|
|
211
|
+
You can also load all files from a directory or use the unified `path` function:
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
# Load all files matching a glob pattern from a directory
|
|
215
|
+
casebase = cbrkit.loaders.directory("path/to/cases/", pattern="*.json")
|
|
216
|
+
|
|
217
|
+
# Unified path function: auto-detects whether path is a file or directory
|
|
218
|
+
casebase = cbrkit.loaders.path("path/to/cases.json") # single file
|
|
219
|
+
casebase = cbrkit.loaders.path("path/to/cases/") # directory
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Additionally, CBRkit integrates with `polars` and `pandas` for loading data frames:
|
|
109
223
|
|
|
110
224
|
```python
|
|
111
225
|
import polars as pl
|
|
@@ -115,6 +229,25 @@ df = pl.read_csv("path/to/cases.csv")
|
|
|
115
229
|
casebase = cbrkit.loaders.polars(df)
|
|
116
230
|
```
|
|
117
231
|
|
|
232
|
+
For database access, CBRkit provides `sqlite` and `sqlalchemy` loaders (the latter requires the `sql` extra):
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
casebase = cbrkit.loaders.sqlite("path/to/database.db", "SELECT * FROM cases")
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
**Tip:** You can validate a loaded casebase against a Pydantic model using `cbrkit.loaders.validate()`:
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
from pydantic import BaseModel
|
|
242
|
+
|
|
243
|
+
class Car(BaseModel):
|
|
244
|
+
price: int
|
|
245
|
+
year: int
|
|
246
|
+
model: str
|
|
247
|
+
|
|
248
|
+
casebase = cbrkit.loaders.validate(casebase, Car)
|
|
249
|
+
```
|
|
250
|
+
|
|
118
251
|
## Defining Queries
|
|
119
252
|
|
|
120
253
|
CBRkit expects the type of the queries to match the type of the cases.
|
|
@@ -139,6 +272,29 @@ In case your query collection only contains a single entry, you can use the `sin
|
|
|
139
272
|
query = cbrkit.helpers.singleton(queries)
|
|
140
273
|
```
|
|
141
274
|
|
|
275
|
+
## Exporting Data
|
|
276
|
+
|
|
277
|
+
CBRkit provides functions for exporting data through the `cbrkit.dumpers` module.
|
|
278
|
+
Supported formats include JSON, YAML, CSV, TOML, and Markdown.
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
import cbrkit
|
|
282
|
+
|
|
283
|
+
# Export to a file (format is inferred from the extension)
|
|
284
|
+
cbrkit.dumpers.file("output.json", data)
|
|
285
|
+
cbrkit.dumpers.file("output.yaml", data)
|
|
286
|
+
|
|
287
|
+
# Export to a directory (one file per entry)
|
|
288
|
+
cbrkit.dumpers.directory("output/", data)
|
|
289
|
+
|
|
290
|
+
# Or use the unified path function
|
|
291
|
+
cbrkit.dumpers.path("output.json", data) # writes a single file
|
|
292
|
+
cbrkit.dumpers.path("output/", data) # writes a directory
|
|
293
|
+
|
|
294
|
+
# Format data as a Markdown code block
|
|
295
|
+
md = cbrkit.dumpers.markdown()(data)
|
|
296
|
+
```
|
|
297
|
+
|
|
142
298
|
## Similarity Measures and Aggregation
|
|
143
299
|
|
|
144
300
|
The next step is to define similarity measures for the cases and queries.
|
|
@@ -229,6 +385,21 @@ cached_sim = cbrkit.sim.embed.build(
|
|
|
229
385
|
)
|
|
230
386
|
```
|
|
231
387
|
|
|
388
|
+
#### Collection and Sequence Similarity
|
|
389
|
+
|
|
390
|
+
CBRkit provides similarity measures for collections and sequences in `cbrkit.sim.collections`:
|
|
391
|
+
|
|
392
|
+
```python
|
|
393
|
+
# Jaccard similarity for sets (requires the `nltk` extra)
|
|
394
|
+
jaccard_sim = cbrkit.sim.collections.jaccard()
|
|
395
|
+
|
|
396
|
+
# Optimal sequence mapping using A* search
|
|
397
|
+
seq_sim = cbrkit.sim.collections.mapping(cbrkit.sim.generic.equality())
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
Dynamic Time Warping and Smith-Waterman alignment are available with the `timeseries` extra.
|
|
401
|
+
See the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim/collections.html) for the full list.
|
|
402
|
+
|
|
232
403
|
#### Taxonomy-Based Similarity
|
|
233
404
|
|
|
234
405
|
```python
|
|
@@ -269,20 +440,15 @@ CBRkit provides extensive support for graph similarity through various algorithm
|
|
|
269
440
|
|
|
270
441
|
```python
|
|
271
442
|
# Using Graph Edit Distance (GED) with A* search
|
|
272
|
-
graph_sim = cbrkit.sim.graphs.astar(
|
|
273
|
-
|
|
443
|
+
graph_sim = cbrkit.sim.graphs.astar.build(
|
|
444
|
+
node_sim_func=cbrkit.sim.generic.equality(),
|
|
274
445
|
node_matcher=lambda n1, n2: n1 == n2,
|
|
275
|
-
edge_matcher=lambda e1, e2: e1 == e2
|
|
446
|
+
edge_matcher=lambda e1, e2: e1 == e2,
|
|
276
447
|
)
|
|
277
448
|
```
|
|
278
449
|
|
|
279
|
-
Available graph algorithms include
|
|
280
|
-
|
|
281
|
-
- `vf2`: VF2 algorithm for (sub)graph isomorphism
|
|
282
|
-
- `lap`: Linear assignment problem solver
|
|
283
|
-
- `greedy`: Fast greedy matching
|
|
284
|
-
- `brute_force`: Exhaustive search for small graphs
|
|
285
|
-
- `dfs`: Depth-first search based matching
|
|
450
|
+
Available graph algorithms include `astar`, `vf2`, `greedy`, `lap`, `brute_force`, `dfs`, `dtw`, and `smith_waterman`.
|
|
451
|
+
See the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim/graphs.html) for a full list of algorithms and their parameters.
|
|
286
452
|
|
|
287
453
|
### Global Similarity and Aggregation
|
|
288
454
|
|
|
@@ -333,9 +499,30 @@ cbrkit.sim.attribute_value(
|
|
|
333
499
|
)
|
|
334
500
|
```
|
|
335
501
|
|
|
502
|
+
## CBR Cycle Phases
|
|
503
|
+
|
|
504
|
+
All four phases of the CBR cycle — retrieval, reuse, revise, and retain — follow the same unified protocol `CbrFunc` (defined in `cbrkit.typing`).
|
|
505
|
+
Each phase function takes a casebase and a query, and returns an updated casebase together with a score map.
|
|
506
|
+
The casebase in the output may differ from the input depending on the phase (e.g., adapted cases in reuse, newly stored cases in retain).
|
|
507
|
+
The score map assigns a floating-point score to each case in the output casebase, with phase-specific semantics:
|
|
508
|
+
|
|
509
|
+
- **Retrieval**: Similarity scores between cases and the query.
|
|
510
|
+
- **Reuse**: Quality scores of adapted cases compared to the query.
|
|
511
|
+
- **Revise**: Assessment scores evaluating solution correctness.
|
|
512
|
+
- **Retain**: Fitness scores for retained cases.
|
|
513
|
+
|
|
514
|
+
This uniform interface makes it easy to compose phases into pipelines and to swap implementations.
|
|
515
|
+
The phase-specific type aliases `RetrieverFunc`, `ReuserFunc`, `ReviserFunc`, and `RetainerFunc` are provided for clarity but are structurally identical to `CbrFunc`.
|
|
516
|
+
|
|
517
|
+
Each phase result has the following attributes:
|
|
518
|
+
|
|
519
|
+
- `similarities`: A dictionary containing the scores for each case.
|
|
520
|
+
- `ranking`: A list of case keys sorted by their score.
|
|
521
|
+
- `casebase`: The casebase containing the output cases.
|
|
522
|
+
|
|
336
523
|
## Retrieval
|
|
337
524
|
|
|
338
|
-
The
|
|
525
|
+
The first phase is to retrieve cases based on the loaded queries.
|
|
339
526
|
The `cbrkit.retrieval` module provides utility functions for this purpose.
|
|
340
527
|
You first build a retrieval pipeline by specifying a global similarity function and optionally a limit for the number of retrieved cases.
|
|
341
528
|
|
|
@@ -439,50 +626,177 @@ An overview of all available adaptation functions can be found in the [module do
|
|
|
439
626
|
|
|
440
627
|
## Reuse
|
|
441
628
|
|
|
442
|
-
The reuse phase applies adaptation functions to retrieved cases
|
|
629
|
+
The reuse phase applies adaptation functions to retrieved cases and scores the adapted results.
|
|
630
|
+
The `cbrkit.reuse` module provides utility functions for this purpose.
|
|
631
|
+
You build a reuse pipeline by specifying an adaptation function and a similarity function:
|
|
443
632
|
|
|
444
633
|
```python
|
|
445
634
|
reuser = cbrkit.reuse.build(
|
|
446
|
-
cbrkit.adapt.attribute_value(...),
|
|
635
|
+
adaptation_func=cbrkit.adapt.attribute_value(...),
|
|
636
|
+
similarity_func=cbrkit.sim.attribute_value(...),
|
|
447
637
|
)
|
|
448
638
|
```
|
|
449
639
|
|
|
450
|
-
This reuser can then be applied to
|
|
640
|
+
This reuser can then be applied to a retrieval result to adapt cases based on a query:
|
|
451
641
|
|
|
452
642
|
```python
|
|
453
|
-
result = cbrkit.reuse.
|
|
643
|
+
result = cbrkit.reuse.apply_result(retrieval_result, reuser)
|
|
454
644
|
```
|
|
455
645
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
- `adaptations`: A dictionary containing the adapted values for each case.
|
|
459
|
-
- `ranking`: A list of case indices matching the retrieval result.
|
|
460
|
-
- `casebase`: The casebase containing only the adapted cases.
|
|
646
|
+
As with all CBR phases, the result contains `similarities` (quality scores of adapted cases), `ranking`, and `casebase` (containing the adapted cases).
|
|
461
647
|
|
|
462
648
|
Multiple reuse pipelines can be combined by passing them as a list or tuple:
|
|
463
649
|
|
|
464
650
|
```python
|
|
465
651
|
reuser1 = cbrkit.reuse.build(...)
|
|
466
652
|
reuser2 = cbrkit.reuse.build(...)
|
|
467
|
-
result = cbrkit.reuse.
|
|
653
|
+
result = cbrkit.reuse.apply_result(retrieval_result, (reuser1, reuser2))
|
|
468
654
|
```
|
|
469
655
|
|
|
470
656
|
The result structure follows the same pattern as the retrieval results with `final_step` and `steps` attributes.
|
|
471
657
|
|
|
658
|
+
## Revise
|
|
659
|
+
|
|
660
|
+
The revise phase assesses the quality of solutions produced by the reuse phase and optionally repairs them.
|
|
661
|
+
The `cbrkit.revise` module provides utility functions for this purpose.
|
|
662
|
+
You build a revise pipeline by specifying an assessment function and an optional repair function:
|
|
663
|
+
|
|
664
|
+
```python
|
|
665
|
+
reviser = cbrkit.revise.build(
|
|
666
|
+
assess_func=cbrkit.sim.attribute_value(...),
|
|
667
|
+
repair_func=some_adaptation_func, # optional
|
|
668
|
+
)
|
|
669
|
+
```
|
|
670
|
+
|
|
671
|
+
The reviser can be applied to a reuse result:
|
|
672
|
+
|
|
673
|
+
```python
|
|
674
|
+
result = cbrkit.revise.apply_result(reuse_result, reviser)
|
|
675
|
+
```
|
|
676
|
+
|
|
677
|
+
When a `repair_func` is provided, solutions are repaired before assessment.
|
|
678
|
+
The result contains `similarities` with quality assessment scores for each case.
|
|
679
|
+
|
|
680
|
+
## Retain
|
|
681
|
+
|
|
682
|
+
The retain phase decides whether and how to integrate new cases into the casebase.
|
|
683
|
+
The `cbrkit.retain` module provides utility functions for this purpose.
|
|
684
|
+
You build a retain pipeline by specifying an assessment function and a storage function:
|
|
685
|
+
|
|
686
|
+
```python
|
|
687
|
+
retainer = cbrkit.retain.build(
|
|
688
|
+
assess_func=cbrkit.sim.generic.equality(),
|
|
689
|
+
storage_func=cbrkit.retain.static(
|
|
690
|
+
key_func=lambda keys: max(keys, default=-1) + 1,
|
|
691
|
+
casebase=casebase,
|
|
692
|
+
),
|
|
693
|
+
)
|
|
694
|
+
```
|
|
695
|
+
|
|
696
|
+
CBRkit provides several built-in storage functions:
|
|
697
|
+
|
|
698
|
+
- `static`: Generates keys from a fixed reference casebase to avoid collisions.
|
|
699
|
+
- `indexable`: Keeps an `IndexableFunc`'s index in sync with the casebase.
|
|
700
|
+
|
|
701
|
+
You can filter retained cases based on their assessment scores using the `dropout` wrapper:
|
|
702
|
+
|
|
703
|
+
```python
|
|
704
|
+
retainer = cbrkit.retain.dropout(
|
|
705
|
+
retainer_func=cbrkit.retain.build(...),
|
|
706
|
+
min_similarity=0.5,
|
|
707
|
+
)
|
|
708
|
+
```
|
|
709
|
+
|
|
710
|
+
The retainer can be applied to a revise result:
|
|
711
|
+
|
|
712
|
+
```python
|
|
713
|
+
result = cbrkit.retain.apply_result(revise_result, retainer)
|
|
714
|
+
```
|
|
715
|
+
|
|
716
|
+
The result contains `similarities` with fitness scores and `casebase` with the updated cases.
|
|
717
|
+
|
|
718
|
+
## Full CBR Cycle
|
|
719
|
+
|
|
720
|
+
The `cbrkit.cycle` module orchestrates all four phases (retrieval, reuse, revise, retain) in a single call.
|
|
721
|
+
This is useful when you want to run the complete CBR cycle without manually chaining the phases.
|
|
722
|
+
|
|
723
|
+
```python
|
|
724
|
+
result = cbrkit.cycle.apply_query(
|
|
725
|
+
casebase,
|
|
726
|
+
query,
|
|
727
|
+
retrievers=retriever,
|
|
728
|
+
reusers=reuser,
|
|
729
|
+
revisers=reviser,
|
|
730
|
+
retainers=retainer,
|
|
731
|
+
)
|
|
732
|
+
# Access results from each phase
|
|
733
|
+
retrieval_result = result.retrieval
|
|
734
|
+
reuse_result = result.reuse
|
|
735
|
+
revise_result = result.revise
|
|
736
|
+
retain_result = result.retain
|
|
737
|
+
```
|
|
738
|
+
|
|
739
|
+
For multiple queries, use `cbrkit.cycle.apply_queries` or `cbrkit.cycle.apply_batches`.
|
|
740
|
+
|
|
741
|
+
## System
|
|
742
|
+
|
|
743
|
+
The `cbrkit.system.System` class provides a high-level interface for composing all CBR phases into a single reusable object.
|
|
744
|
+
It is especially useful for integrating CBRkit into applications where the casebase and phase functions are configured once and reused across multiple queries.
|
|
745
|
+
|
|
746
|
+
```python
|
|
747
|
+
system = cbrkit.system.System(
|
|
748
|
+
casebase=casebase,
|
|
749
|
+
retriever_factory=lambda config: retriever,
|
|
750
|
+
reuser_factory=lambda config: reuser,
|
|
751
|
+
)
|
|
752
|
+
|
|
753
|
+
# Run individual phases
|
|
754
|
+
retrieval_result = system.retrieve(query)
|
|
755
|
+
reuse_result = system.reuse(query)
|
|
756
|
+
|
|
757
|
+
# Run the full cycle
|
|
758
|
+
cycle_result = system.cycle(query)
|
|
759
|
+
```
|
|
760
|
+
|
|
761
|
+
The `System` class supports optional configuration parameters for each phase factory, allowing you to customize the behavior per query.
|
|
762
|
+
|
|
472
763
|
## Advanced Retrieval
|
|
473
764
|
|
|
474
765
|
### BM25 Retrieval
|
|
475
766
|
|
|
476
|
-
CBRkit includes a BM25 retriever for text-based retrieval
|
|
767
|
+
CBRkit includes a BM25 retriever for sparse text-based retrieval (requires the `bm25` extra).
|
|
768
|
+
The BM25 retriever delegates text tokenization to a `cbrkit.sim.embed.bm25` embedding function:
|
|
477
769
|
|
|
478
770
|
```python
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
771
|
+
bm25_func = cbrkit.sim.embed.bm25(language="en")
|
|
772
|
+
retriever = cbrkit.retrieval.dropout(
|
|
773
|
+
cbrkit.retrieval.bm25(conversion_func=bm25_func),
|
|
774
|
+
limit=10,
|
|
775
|
+
)
|
|
776
|
+
result = cbrkit.retrieval.apply_query(casebase, query, retriever)
|
|
777
|
+
```
|
|
778
|
+
|
|
779
|
+
### Embedding-Based Retrieval
|
|
780
|
+
|
|
781
|
+
CBRkit supports embedding-based retrieval through vector similarity search.
|
|
782
|
+
The `embed` retriever uses an embedding function with caching and a vector scorer:
|
|
783
|
+
|
|
784
|
+
```python
|
|
785
|
+
embed_func = cbrkit.sim.embed.cache(
|
|
786
|
+
func=cbrkit.sim.embed.sentence_transformers(model="all-MiniLM-L6-v2"),
|
|
787
|
+
path="embeddings.sqlite3",
|
|
788
|
+
table="strf/minilm",
|
|
789
|
+
)
|
|
790
|
+
retriever = cbrkit.retrieval.dropout(
|
|
791
|
+
cbrkit.retrieval.embed(conversion_func=embed_func),
|
|
792
|
+
limit=10,
|
|
482
793
|
)
|
|
483
794
|
result = cbrkit.retrieval.apply_query(casebase, query, retriever)
|
|
484
795
|
```
|
|
485
796
|
|
|
797
|
+
For persistent storage backends, CBRkit also supports `lancedb`, `chromadb`, and `zvec` retrievers (each requires its respective extra).
|
|
798
|
+
These backends manage index persistence and support hybrid search modes.
|
|
799
|
+
|
|
486
800
|
### Combining Multiple Retrievers
|
|
487
801
|
|
|
488
802
|
The `combine` function allows merging results from multiple retrievers:
|
|
@@ -492,23 +806,78 @@ retriever1 = cbrkit.retrieval.build(...)
|
|
|
492
806
|
retriever2 = cbrkit.retrieval.bm25(...)
|
|
493
807
|
|
|
494
808
|
combined = cbrkit.retrieval.combine(
|
|
495
|
-
|
|
496
|
-
aggregator=cbrkit.sim.aggregator(pooling="mean")
|
|
809
|
+
retriever_funcs=[retriever1, retriever2],
|
|
810
|
+
aggregator=cbrkit.sim.aggregator(pooling="mean"),
|
|
497
811
|
)
|
|
498
812
|
result = cbrkit.retrieval.apply_query(casebase, query, combined)
|
|
499
813
|
```
|
|
500
814
|
|
|
501
815
|
### Distributed Processing
|
|
502
816
|
|
|
503
|
-
|
|
817
|
+
`build` and `distribute` offer two different levels of parallelism.
|
|
818
|
+
`build(sim_func, multiprocessing=True)` parallelizes the **similarity computations** within batches: all (casebase, query) pairs are flattened into individual comparisons and distributed across processes.
|
|
819
|
+
`distribute(retriever, multiprocessing=True)` parallelizes across **batches**: each (casebase, query) pair is handed to the wrapped retriever and processed in a separate process.
|
|
820
|
+
Use `distribute` when you have many independent queries and want to process them in parallel as separate retrieval tasks:
|
|
504
821
|
|
|
505
822
|
```python
|
|
506
823
|
retriever = cbrkit.retrieval.distribute(
|
|
507
824
|
cbrkit.retrieval.build(...),
|
|
508
|
-
|
|
825
|
+
multiprocessing=True, # or an integer for a specific number of processes
|
|
509
826
|
)
|
|
510
827
|
```
|
|
511
828
|
|
|
829
|
+
### Re-ranking
|
|
830
|
+
|
|
831
|
+
CBRkit supports re-ranking retrieved results using external models.
|
|
832
|
+
Re-rankers take the initial retrieval results and reorder them based on a more expensive model.
|
|
833
|
+
The following re-rankers are available (each requires its respective extra):
|
|
834
|
+
|
|
835
|
+
- `cbrkit.retrieval.cohere`: Cohere re-ranking (extra: `cohere`)
|
|
836
|
+
- `cbrkit.retrieval.voyageai`: Voyage AI re-ranking (extra: `voyageai`)
|
|
837
|
+
- `cbrkit.retrieval.sentence_transformers`: Sentence Transformers cross-encoder re-ranking (extra: `transformers`)
|
|
838
|
+
|
|
839
|
+
```python
|
|
840
|
+
reranker = cbrkit.retrieval.cohere(model="rerank-v3.5")
|
|
841
|
+
|
|
842
|
+
# Use as a second-stage retriever in a sequential pipeline
|
|
843
|
+
retriever = cbrkit.retrieval.build(cbrkit.sim.attribute_value(...))
|
|
844
|
+
result = cbrkit.retrieval.apply_query(casebase, query, (retriever, reranker))
|
|
845
|
+
```
|
|
846
|
+
|
|
847
|
+
### Indexed Retrieval
|
|
848
|
+
|
|
849
|
+
Some retrievers like `bm25`, `embed`, and `lancedb` support **indexed retrieval**, where the casebase is pre-indexed once and then queried without passing the full casebase each time.
|
|
850
|
+
This is useful for large casebases or when using external search backends.
|
|
851
|
+
|
|
852
|
+
To use indexed retrieval, first create a retriever and call its `create_index()` method:
|
|
853
|
+
|
|
854
|
+
```python
|
|
855
|
+
from frozendict import frozendict
|
|
856
|
+
|
|
857
|
+
bm25_func = cbrkit.sim.embed.bm25(language="en")
|
|
858
|
+
retriever = cbrkit.retrieval.bm25(conversion_func=bm25_func)
|
|
859
|
+
retriever.create_index(frozendict(casebase))
|
|
860
|
+
```
|
|
861
|
+
|
|
862
|
+
Then pass an empty casebase (`{}`) to signal that the retriever should use its pre-indexed data:
|
|
863
|
+
|
|
864
|
+
```python
|
|
865
|
+
result = cbrkit.retrieval.apply_query({}, query, retriever)
|
|
866
|
+
```
|
|
867
|
+
|
|
868
|
+
As a convenience, CBRkit provides `apply_query_indexed` and `apply_queries_indexed` which handle the empty casebase automatically:
|
|
869
|
+
|
|
870
|
+
```python
|
|
871
|
+
result = cbrkit.retrieval.apply_query_indexed(query, retriever)
|
|
872
|
+
# or for multiple queries:
|
|
873
|
+
result = cbrkit.retrieval.apply_queries_indexed(queries, retriever)
|
|
874
|
+
```
|
|
875
|
+
|
|
876
|
+
If a retriever receives an empty casebase but has not been indexed yet, a `ValueError` is raised telling you to index the casebase first (via `create_index()`).
|
|
877
|
+
|
|
878
|
+
The `System` class also supports indexed retrieval by defaulting the casebase to an empty dict.
|
|
879
|
+
This allows creating a system where all retrievers are pre-indexed and no casebase needs to be provided at query time.
|
|
880
|
+
|
|
512
881
|
## Evaluation
|
|
513
882
|
|
|
514
883
|
CBRkit provides evaluation tools through the `cbrkit.eval` module for assessing the quality of retrieval results.
|
|
@@ -559,20 +928,26 @@ All of them can be computed at different cutoff points by appending `@k`, e.g.,
|
|
|
559
928
|
We also offer a function to automatically generate a list of metrics for different cutoff points:
|
|
560
929
|
|
|
561
930
|
```python
|
|
562
|
-
metrics = cbrkit.eval.
|
|
931
|
+
metrics = cbrkit.eval.generate_metrics(["precision", "recall", "f1"], ks=[1, 5, 10])
|
|
563
932
|
```
|
|
564
933
|
|
|
565
934
|
## Synthesis
|
|
566
935
|
|
|
567
936
|
In the context of CBRkit, synthesis refers to creating new insights from the cases which were retrieved in a previous retrieval step, for example in a RAG context. CBRkit builds a synthesizer using the function `cbrkit.synthesis.build` with a `provider` and a `prompt`. A synthesizer maps a `Result` (obtained in the retrieval step) to an LLM output (either a string or structured output). An example can be found in [examples/cars_rag.py](https://github.com/wi2trier/cbrkit/blob/main/examples/cars_rag.py).
|
|
568
937
|
|
|
569
|
-
The following **providers** are
|
|
938
|
+
The following **providers** are available in `cbrkit.synthesis.providers` (each requires its respective extra):
|
|
570
939
|
|
|
571
|
-
-
|
|
572
|
-
-
|
|
573
|
-
-
|
|
574
|
-
-
|
|
575
|
-
-
|
|
940
|
+
- `openai` / `openai_completions`: OpenAI Completions API (`OPENAI_API_KEY`)
|
|
941
|
+
- `openai_responses`: OpenAI Responses API (`OPENAI_API_KEY`)
|
|
942
|
+
- `openai_agents`: OpenAI Agents framework (`OPENAI_API_KEY`)
|
|
943
|
+
- `anthropic`: Anthropic Claude API (`ANTHROPIC_API_KEY`)
|
|
944
|
+
- `cohere`: Cohere API (`CO_API_KEY`)
|
|
945
|
+
- `google`: Google Generative AI (`GOOGLE_API_KEY`)
|
|
946
|
+
- `ollama`: Ollama (local, no API key needed)
|
|
947
|
+
- `pydantic_ai`: Pydantic AI framework
|
|
948
|
+
- `instructor`: Instructor for structured output
|
|
949
|
+
|
|
950
|
+
Providers can be chained using `cbrkit.synthesis.providers.pipe()` and managed as conversations using `cbrkit.synthesis.providers.conversation()`.
|
|
576
951
|
|
|
577
952
|
The respective provider class in `cbrkit.synthesis.providers` has to be initialized with the model name and a response type (either `str` or a [Pydantic model](https://docs.pydantic.dev/latest/concepts/models/) for structured output). Further model options like `temperature`, `seed`, `max_tokens`, etc. can also be specified here.
|
|
578
953
|
|
|
@@ -603,16 +978,15 @@ CBRKit's `transpose` prompt allows to transpose cases and queries before they ar
|
|
|
603
978
|
|
|
604
979
|
```python
|
|
605
980
|
from cbrkit.typing import JsonEntry
|
|
606
|
-
from cbrkit.dumpers import json_markdown
|
|
607
981
|
|
|
608
982
|
def encoder(value) -> dict:
|
|
609
983
|
...
|
|
610
984
|
baseprompt = cbrkit.synthesis.prompts.default(instructions, encoder=encoder)
|
|
611
985
|
# transform the entries, e.g., by shortening, leaving out irrelevant attributes, etc.
|
|
612
|
-
# In this case, the value of every field is
|
|
986
|
+
# In this case, the value of every field is truncated to 100 characters
|
|
613
987
|
def shorten(entry: dict) -> JsonEntry:
|
|
614
|
-
entry = {k: str(v)[:100] for k,v in entry.items()}
|
|
615
|
-
return
|
|
988
|
+
entry = {k: str(v)[:100] for k, v in entry.items()}
|
|
989
|
+
return cbrkit.dumpers.markdown()(entry)
|
|
616
990
|
|
|
617
991
|
prompt = cbrkit.synthesis.prompts.transpose(baseprompt, shorten)
|
|
618
992
|
synthesizer = cbrkit.synthesis.build(provider, prompt)
|
|
@@ -650,6 +1024,60 @@ response = get_result(batches)
|
|
|
650
1024
|
|
|
651
1025
|
The complete version of this example can be found under `examples/cars_rag_large.py`.
|
|
652
1026
|
|
|
1027
|
+
## Tips and Common Patterns
|
|
1028
|
+
|
|
1029
|
+
### Parameter Naming Conventions
|
|
1030
|
+
|
|
1031
|
+
CBRkit inspects function signatures to determine their behavior:
|
|
1032
|
+
|
|
1033
|
+
- **Similarity functions** must use `x` (case) and `y` (query) as parameter names.
|
|
1034
|
+
- **Adaptation functions** must use `case` and `query` for pair functions, or `casebase` and `query` for map/reduce functions.
|
|
1035
|
+
- **Batch functions** accept a list of tuples instead of individual pairs: `f([(x1, y1), (x2, y2), ...])`.
|
|
1036
|
+
|
|
1037
|
+
### Filtering with `dropout`
|
|
1038
|
+
|
|
1039
|
+
The `dropout` wrapper is the standard way to add limits and thresholds to any retriever or retainer.
|
|
1040
|
+
It supports `limit` (maximum number of results), `min_similarity`, and `max_similarity`:
|
|
1041
|
+
|
|
1042
|
+
```python
|
|
1043
|
+
retriever = cbrkit.retrieval.dropout(
|
|
1044
|
+
cbrkit.retrieval.build(sim_func),
|
|
1045
|
+
limit=10,
|
|
1046
|
+
min_similarity=0.3,
|
|
1047
|
+
)
|
|
1048
|
+
```
|
|
1049
|
+
|
|
1050
|
+
### Composing Multiple Phase Functions
|
|
1051
|
+
|
|
1052
|
+
All CBR phases support sequential composition by passing a tuple of phase functions.
|
|
1053
|
+
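Each step receives the output casebase of the previous step, enabling patterns like MAC/FAC ("many are called, few are chosen"), where a cheap retriever pre-filters the casebase before a more expensive retriever ranks the remaining cases: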
|
|
1054
|
+
|
|
1055
|
+
```python
|
|
1056
|
+
result = cbrkit.retrieval.apply_query(casebase, query, (cheap_retriever, expensive_retriever))
|
|
1057
|
+
```
|
|
1058
|
+
|
|
1059
|
+
### Using `frozendict` for Immutable Casebases
|
|
1060
|
+
|
|
1061
|
+
Several components (e.g., indexed retrieval, retain phase) benefit from immutable casebases.
|
|
1062
|
+
Use `frozendict` to prevent accidental mutations:
|
|
1063
|
+
|
|
1064
|
+
```python
|
|
1065
|
+
from frozendict import frozendict
|
|
1066
|
+
casebase = frozendict(cbrkit.loaders.file("cases.json"))
|
|
1067
|
+
```
|
|
1068
|
+
|
|
1069
|
+
### Multiprocessing Support
|
|
1070
|
+
|
|
1071
|
+
The `cbrkit.retrieval.build` function supports multiprocessing to parallelize similarity computations within batches:
|
|
1072
|
+
|
|
1073
|
+
```python
|
|
1074
|
+
retriever = cbrkit.retrieval.build(sim_func, multiprocessing=True)
|
|
1075
|
+
# or with a specific number of processes:
|
|
1076
|
+
retriever = cbrkit.retrieval.build(sim_func, multiprocessing=4)
|
|
1077
|
+
```
|
|
1078
|
+
|
|
1079
|
+
To parallelize across batches instead, see [Distributed Processing](#distributed-processing).
|
|
1080
|
+
|
|
653
1081
|
## Logging
|
|
654
1082
|
|
|
655
1083
|
CBRkit integrates with the `logging` module to provide a unified logging interface.
|