cbrkit 0.26.4__tar.gz → 0.27.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cbrkit-0.26.4 → cbrkit-0.27.0}/PKG-INFO +175 -13
- {cbrkit-0.26.4 → cbrkit-0.27.0}/README.md +172 -12
- {cbrkit-0.26.4 → cbrkit-0.27.0}/pyproject.toml +2 -1
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/eval/common.py +1 -1
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/eval/retrieval.py +2 -4
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/helpers.py +26 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/__init__.py +4 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/build.py +45 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/__init__.py +4 -2
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/aggregator.py +3 -34
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/astar.py +8 -44
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/common.py +42 -5
- cbrkit-0.27.0/src/cbrkit/sim/pooling.py +100 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/strings.py +80 -25
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/typing.py +40 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/__main__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/attribute_value.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/generic.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/numbers.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/strings.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/api.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/cli.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/constants.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/cycle.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/dumpers.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/eval/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/loaders.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/model/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/model/graph.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/model/result.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/py.typed +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/apply.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/rerank.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/reuse/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/reuse/apply.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/reuse/build.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/attribute_value.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/collections.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/embed.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/generic.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/alignment.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/brute_force.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/dfs.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/greedy.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/lap.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/precompute.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/qap.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/vf2.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/numbers.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/taxonomy.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/wrappers.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/apply.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/build.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/model.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/prompts.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/__init__.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/anthropic.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/cohere.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/google.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/instructor.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/model.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/ollama.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/openai.py +0 -0
- {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/wrappers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: cbrkit
|
|
3
|
-
Version: 0.26.4
|
|
3
|
+
Version: 0.27.0
|
|
4
4
|
Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
|
|
5
5
|
Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
|
|
6
6
|
Author: Mirko Lenz
|
|
@@ -39,6 +39,7 @@ Requires-Dist: fastapi>=0.100,<1 ; extra == 'api'
|
|
|
39
39
|
Requires-Dist: pydantic-settings>=2,<3 ; extra == 'api'
|
|
40
40
|
Requires-Dist: python-multipart>=0.0.15,<1 ; extra == 'api'
|
|
41
41
|
Requires-Dist: uvicorn[standard]>=0.30,<1 ; extra == 'api'
|
|
42
|
+
Requires-Dist: chonkie>=1,<2 ; extra == 'chunking'
|
|
42
43
|
Requires-Dist: typer>=0.9,<1 ; extra == 'cli'
|
|
43
44
|
Requires-Dist: ranx>=0.3,<1 ; extra == 'eval'
|
|
44
45
|
Requires-Dist: networkx>=3,<4 ; extra == 'graphs'
|
|
@@ -70,6 +71,7 @@ Project-URL: Issues, https://github.com/wi2trier/cbrkit/issues
|
|
|
70
71
|
Project-URL: Repository, https://github.com/wi2trier/cbrkit
|
|
71
72
|
Provides-Extra: all
|
|
72
73
|
Provides-Extra: api
|
|
74
|
+
Provides-Extra: chunking
|
|
73
75
|
Provides-Extra: cli
|
|
74
76
|
Provides-Extra: eval
|
|
75
77
|
Provides-Extra: graphs
|
|
@@ -119,12 +121,25 @@ Further examples can be found in our [tests](./tests/test_retrieve.py) and [docu
|
|
|
119
121
|
The following modules are part of CBRkit:
|
|
120
122
|
|
|
121
123
|
- `cbrkit.loaders` and `cbrkit.dumpers`: Functions for loading and exporting cases and queries.
|
|
122
|
-
- `cbrkit.sim`: Similarity
|
|
123
|
-
- `cbrkit.
|
|
124
|
+
- `cbrkit.sim`: Similarity functions for common data types and some utility functions such as `cache`, `combine`, `transpose`, etc.
|
|
125
|
+
- `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, semantic, etc.).
|
|
126
|
+
- `cbrkit.sim.numbers`: Numeric similarity measures (linear, exponential, threshold).
|
|
127
|
+
- `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, DTW, Smith-Waterman).
|
|
128
|
+
- `cbrkit.sim.embed`: Embedding-based similarity functions with caching support.
|
|
129
|
+
- `cbrkit.sim.graphs`: Graph similarity algorithms including GED, A*, VF2, and more.
|
|
130
|
+
- `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions.
|
|
131
|
+
- `cbrkit.sim.generic`: Generic similarity functions (equality, tables, static).
|
|
132
|
+
- `cbrkit.sim.attribute_value`: Similarity for attribute-value based data.
|
|
133
|
+
- `cbrkit.sim.pooling`: Functions for aggregating multiple similarity values.
|
|
134
|
+
- `cbrkit.sim.aggregator`: Combines multiple local measures into global scores.
|
|
135
|
+
- `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines, includes BM25 retrieval, rerankers, etc.
|
|
124
136
|
- `cbrkit.adapt`: Adaptation generator functions for adapting cases based on a query.
|
|
125
137
|
- `cbrkit.reuse`: Functions for defining and applying reuse pipelines.
|
|
138
|
+
- `cbrkit.eval`: Evaluation metrics for retrieval results including precision, recall, and custom metrics.
|
|
139
|
+
- `cbrkit.model`: Data models for graphs and results.
|
|
140
|
+
- `cbrkit.cycle`: CBR cycle implementation.
|
|
126
141
|
- `cbrkit.typing`: Generic type definitions for defining custom functions.
|
|
127
|
-
- `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g
|
|
142
|
+
- `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g., in a RAG context.
|
|
128
143
|
|
|
129
144
|
## Installation
|
|
130
145
|
|
|
@@ -235,18 +250,125 @@ You need to make sure that the two parameters are named `x` and `y`, otherwise C
|
|
|
235
250
|
|
|
236
251
|
### Built-in Similarity Measures
|
|
237
252
|
|
|
238
|
-
CBRkit
|
|
253
|
+
CBRkit contains a comprehensive selection of built-in similarity measures for various data types in the module `cbrkit.sim`.
|
|
239
254
|
They are provided through **generator functions** that allow you to customize the behavior of the built-in measures.
|
|
240
|
-
|
|
255
|
+
|
|
256
|
+
#### String Similarity
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
# Semantic similarity is covered by the `cbrkit.sim.embed` module.
|
|
260
|
+
# See below for details.
|
|
261
|
+
|
|
262
|
+
# Edit distance measures
|
|
263
|
+
levenshtein_sim = cbrkit.sim.strings.levenshtein()
|
|
264
|
+
jaro_sim = cbrkit.sim.strings.jaro()
|
|
265
|
+
|
|
266
|
+
# Exact matching
|
|
267
|
+
equality_sim = cbrkit.sim.generic.equality()
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
#### Number Similarity
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
# Linear similarity with optional thresholds
|
|
274
|
+
linear_sim = cbrkit.sim.numbers.linear(max_distance=100)
|
|
275
|
+
|
|
276
|
+
# Exponential decay similarity
|
|
277
|
+
exp_sim = cbrkit.sim.numbers.exponential(alpha=0.1)
|
|
278
|
+
|
|
279
|
+
# Step functions
|
|
280
|
+
threshold_sim = cbrkit.sim.numbers.threshold(threshold=50)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
#### Embedding-Based Similarity
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
# Build a similarity function with embedding and scorer
|
|
287
|
+
embed_sim = cbrkit.sim.embed.build(
|
|
288
|
+
conversion_func=cbrkit.sim.embed.sentence_transformers(
|
|
289
|
+
model="all-MiniLM-L6-v2"
|
|
290
|
+
),
|
|
291
|
+
sim_func=cbrkit.sim.embed.cosine() # or dot(), angular(), euclidean(), manhattan()
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
# Using OpenAI embeddings
|
|
295
|
+
openai_sim = cbrkit.sim.embed.build(
|
|
296
|
+
conversion_func=cbrkit.sim.embed.openai(
|
|
297
|
+
model="text-embedding-3-small"
|
|
298
|
+
),
|
|
299
|
+
sim_func=cbrkit.sim.embed.cosine()
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
# Caching embeddings for performance
|
|
303
|
+
cached_embed_func = cbrkit.sim.embed.cache(
|
|
304
|
+
func=cbrkit.sim.embed.sentence_transformers(
|
|
305
|
+
model="all-MiniLM-L6-v2"
|
|
306
|
+
),
|
|
307
|
+
path="embeddings_cache.npz",
|
|
308
|
+
autodump=True,
|
|
309
|
+
autoload=True
|
|
310
|
+
)
|
|
311
|
+
cached_sim = cbrkit.sim.embed.build(
|
|
312
|
+
conversion_func=cached_embed_func,
|
|
313
|
+
sim_func=cbrkit.sim.embed.cosine()
|
|
314
|
+
)
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
#### Taxonomy-Based Similarity
|
|
241
318
|
|
|
242
319
|
```python
|
|
243
|
-
|
|
320
|
+
# Load taxonomy from file
|
|
321
|
+
taxonomy_sim = cbrkit.sim.taxonomy.build(
|
|
322
|
+
path="taxonomy.yaml",
|
|
323
|
+
measure=cbrkit.sim.taxonomy.wu_palmer(),
|
|
324
|
+
)
|
|
244
325
|
```
|
|
245
326
|
|
|
246
|
-
|
|
327
|
+
#### Utility Functions
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
# Combining multiple similarity functions
|
|
331
|
+
combined_sim = cbrkit.sim.combine(
|
|
332
|
+
sim_funcs=[sim1, sim2, sim3],
|
|
333
|
+
aggregator=cbrkit.sim.aggregator(pooling="mean")
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
# Caching similarity results
|
|
337
|
+
cached_sim = cbrkit.sim.cache(base_sim_func)
|
|
338
|
+
|
|
339
|
+
# Transposing similarity functions
|
|
340
|
+
transposed_sim = cbrkit.sim.transpose(
|
|
341
|
+
sim_func=number_sim,
|
|
342
|
+
to_x=lambda s: float(s),
|
|
343
|
+
to_y=lambda s: float(s)
|
|
344
|
+
)
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
**Please note:** Calling these functions returns a similarity function itself that has the signature `sim = f(x, y)`.
|
|
247
348
|
|
|
248
349
|
An overview of all available similarity measures can be found in the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim.html).
|
|
249
350
|
|
|
351
|
+
### Graph Similarity
|
|
352
|
+
|
|
353
|
+
CBRkit provides extensive support for graph similarity through various algorithms:
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
# Using Graph Edit Distance (GED) with A* search
|
|
357
|
+
graph_sim = cbrkit.sim.graphs.astar(
|
|
358
|
+
node_sim=cbrkit.sim.generic.equality(),
|
|
359
|
+
node_matcher=lambda n1, n2: n1 == n2,
|
|
360
|
+
edge_matcher=lambda e1, e2: e1 == e2
|
|
361
|
+
)
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
Available graph algorithms include:
|
|
365
|
+
- `astar`: A* search for optimal graph edit distance
|
|
366
|
+
- `vf2`: VF2 algorithm for (sub)graph isomorphism
|
|
367
|
+
- `lap`: Linear assignment problem solver
|
|
368
|
+
- `greedy`: Fast greedy matching
|
|
369
|
+
- `brute_force`: Exhaustive search for small graphs
|
|
370
|
+
- `dfs`: Depth-first search based matching
|
|
371
|
+
|
|
250
372
|
### Global Similarity and Aggregation
|
|
251
373
|
|
|
252
374
|
When dealing with cases that are not represented through elementary data types like strings, we need to aggregate individual measures to obtain a global similarity score.
|
|
@@ -377,9 +499,8 @@ They are provided through **generator functions** that allow you to customize th
|
|
|
377
499
|
For example, a number aggregator can be obtained as follows:
|
|
378
500
|
|
|
379
501
|
```python
|
|
380
|
-
# pooling
|
|
381
|
-
|
|
382
|
-
number_adapter = cbrkit.adapt.numbers.aggregate(pooling)
|
|
502
|
+
# pooling can be a string like "mean", "min", "max", "sum", etc. or a custom PoolingFunction
|
|
503
|
+
number_adapter = cbrkit.adapt.numbers.aggregate(pooling="mean")
|
|
383
504
|
```
|
|
384
505
|
|
|
385
506
|
**Please note:** Calling the function `cbrkit.adapt.numbers.aggregate` returns an adaptation function that takes a collection of values and returns an adapted value.
|
|
@@ -433,6 +554,46 @@ result = cbrkit.reuse.apply_query(retrieval_result, query, (reuser1, reuser2))
|
|
|
433
554
|
|
|
434
555
|
The result structure follows the same pattern as the retrieval results with `final_step` and `steps` attributes.
|
|
435
556
|
|
|
557
|
+
## Advanced Retrieval
|
|
558
|
+
|
|
559
|
+
### BM25 Retrieval
|
|
560
|
+
|
|
561
|
+
CBRkit includes a BM25 retriever for text-based retrieval:
|
|
562
|
+
|
|
563
|
+
```python
|
|
564
|
+
retriever = cbrkit.retrieval.bm25(
|
|
565
|
+
key="text_field", # Field to search in
|
|
566
|
+
limit=10
|
|
567
|
+
)
|
|
568
|
+
result = cbrkit.retrieval.apply_query(casebase, query, retriever)
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
### Combining Multiple Retrievers
|
|
572
|
+
|
|
573
|
+
The `combine` function allows merging results from multiple retrievers:
|
|
574
|
+
|
|
575
|
+
```python
|
|
576
|
+
retriever1 = cbrkit.retrieval.build(...)
|
|
577
|
+
retriever2 = cbrkit.retrieval.bm25(...)
|
|
578
|
+
|
|
579
|
+
combined = cbrkit.retrieval.combine(
|
|
580
|
+
retrievers=[retriever1, retriever2],
|
|
581
|
+
aggregator=cbrkit.sim.aggregator(pooling="mean")
|
|
582
|
+
)
|
|
583
|
+
result = cbrkit.retrieval.apply_query(casebase, query, combined)
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
### Distributed Processing
|
|
587
|
+
|
|
588
|
+
For large-scale retrieval, use the `distribute` wrapper:
|
|
589
|
+
|
|
590
|
+
```python
|
|
591
|
+
retriever = cbrkit.retrieval.distribute(
|
|
592
|
+
cbrkit.retrieval.build(...),
|
|
593
|
+
batch_size=1000
|
|
594
|
+
)
|
|
595
|
+
```
|
|
596
|
+
|
|
436
597
|
## Evaluation
|
|
437
598
|
|
|
438
599
|
CBRkit provides evaluation tools through the `cbrkit.eval` module for assessing the quality of retrieval results.
|
|
@@ -518,7 +679,8 @@ response = cbrkit.synthesis.apply_result(retrieval, synthesizer).response
|
|
|
518
679
|
|
|
519
680
|
### Working with large casebases
|
|
520
681
|
|
|
521
|
-
Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase.
|
|
682
|
+
Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase.
|
|
683
|
+
For this reason, we recommend transposing the cases (e.g., truncating every case to a fixed length) and/or applying chunking.
|
|
522
684
|
|
|
523
685
|
#### Transposing cases
|
|
524
686
|
|
|
@@ -531,7 +693,7 @@ from cbrkit.dumpers import json_markdown
|
|
|
531
693
|
def encoder(value) -> dict:
|
|
532
694
|
...
|
|
533
695
|
baseprompt = cbrkit.synthesis.prompts.default(instructions, encoder=encoder)
|
|
534
|
-
# transform the entries, e.g
|
|
696
|
+
# transform the entries, e.g., by shortening, leaving out irrelevant attributes, etc.
|
|
535
697
|
# In this case, the value of every field is truncated to 100 characters
|
|
536
698
|
def shorten(entry: dict) -> JsonEntry:
|
|
537
699
|
entry = {k: str(v)[:100] for k,v in entry.items()}
|
|
@@ -37,12 +37,25 @@ Further examples can be found in our [tests](./tests/test_retrieve.py) and [docu
|
|
|
37
37
|
The following modules are part of CBRkit:
|
|
38
38
|
|
|
39
39
|
- `cbrkit.loaders` and `cbrkit.dumpers`: Functions for loading and exporting cases and queries.
|
|
40
|
-
- `cbrkit.sim`: Similarity
|
|
41
|
-
- `cbrkit.
|
|
40
|
+
- `cbrkit.sim`: Similarity functions for common data types and some utility functions such as `cache`, `combine`, `transpose`, etc.
|
|
41
|
+
- `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, semantic, etc.).
|
|
42
|
+
- `cbrkit.sim.numbers`: Numeric similarity measures (linear, exponential, threshold).
|
|
43
|
+
- `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, DTW, Smith-Waterman).
|
|
44
|
+
- `cbrkit.sim.embed`: Embedding-based similarity functions with caching support.
|
|
45
|
+
- `cbrkit.sim.graphs`: Graph similarity algorithms including GED, A*, VF2, and more.
|
|
46
|
+
- `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions.
|
|
47
|
+
- `cbrkit.sim.generic`: Generic similarity functions (equality, tables, static).
|
|
48
|
+
- `cbrkit.sim.attribute_value`: Similarity for attribute-value based data.
|
|
49
|
+
- `cbrkit.sim.pooling`: Functions for aggregating multiple similarity values.
|
|
50
|
+
- `cbrkit.sim.aggregator`: Combines multiple local measures into global scores.
|
|
51
|
+
- `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines, includes BM25 retrieval, rerankers, etc.
|
|
42
52
|
- `cbrkit.adapt`: Adaptation generator functions for adapting cases based on a query.
|
|
43
53
|
- `cbrkit.reuse`: Functions for defining and applying reuse pipelines.
|
|
54
|
+
- `cbrkit.eval`: Evaluation metrics for retrieval results including precision, recall, and custom metrics.
|
|
55
|
+
- `cbrkit.model`: Data models for graphs and results.
|
|
56
|
+
- `cbrkit.cycle`: CBR cycle implementation.
|
|
44
57
|
- `cbrkit.typing`: Generic type definitions for defining custom functions.
|
|
45
|
-
- `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g
|
|
58
|
+
- `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g., in a RAG context.
|
|
46
59
|
|
|
47
60
|
## Installation
|
|
48
61
|
|
|
@@ -153,18 +166,125 @@ You need to make sure that the two parameters are named `x` and `y`, otherwise C
|
|
|
153
166
|
|
|
154
167
|
### Built-in Similarity Measures
|
|
155
168
|
|
|
156
|
-
CBRkit
|
|
169
|
+
CBRkit contains a comprehensive selection of built-in similarity measures for various data types in the module `cbrkit.sim`.
|
|
157
170
|
They are provided through **generator functions** that allow you to customize the behavior of the built-in measures.
|
|
158
|
-
|
|
171
|
+
|
|
172
|
+
#### String Similarity
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
# Semantic similarity is covered by the `cbrkit.sim.embed` module.
|
|
176
|
+
# See below for details.
|
|
177
|
+
|
|
178
|
+
# Edit distance measures
|
|
179
|
+
levenshtein_sim = cbrkit.sim.strings.levenshtein()
|
|
180
|
+
jaro_sim = cbrkit.sim.strings.jaro()
|
|
181
|
+
|
|
182
|
+
# Exact matching
|
|
183
|
+
equality_sim = cbrkit.sim.generic.equality()
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
#### Number Similarity
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
# Linear similarity with optional thresholds
|
|
190
|
+
linear_sim = cbrkit.sim.numbers.linear(max_distance=100)
|
|
191
|
+
|
|
192
|
+
# Exponential decay similarity
|
|
193
|
+
exp_sim = cbrkit.sim.numbers.exponential(alpha=0.1)
|
|
194
|
+
|
|
195
|
+
# Step functions
|
|
196
|
+
threshold_sim = cbrkit.sim.numbers.threshold(threshold=50)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
#### Embedding-Based Similarity
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
# Build a similarity function with embedding and scorer
|
|
203
|
+
embed_sim = cbrkit.sim.embed.build(
|
|
204
|
+
conversion_func=cbrkit.sim.embed.sentence_transformers(
|
|
205
|
+
model="all-MiniLM-L6-v2"
|
|
206
|
+
),
|
|
207
|
+
sim_func=cbrkit.sim.embed.cosine() # or dot(), angular(), euclidean(), manhattan()
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
# Using OpenAI embeddings
|
|
211
|
+
openai_sim = cbrkit.sim.embed.build(
|
|
212
|
+
conversion_func=cbrkit.sim.embed.openai(
|
|
213
|
+
model="text-embedding-3-small"
|
|
214
|
+
),
|
|
215
|
+
sim_func=cbrkit.sim.embed.cosine()
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# Caching embeddings for performance
|
|
219
|
+
cached_embed_func = cbrkit.sim.embed.cache(
|
|
220
|
+
func=cbrkit.sim.embed.sentence_transformers(
|
|
221
|
+
model="all-MiniLM-L6-v2"
|
|
222
|
+
),
|
|
223
|
+
path="embeddings_cache.npz",
|
|
224
|
+
autodump=True,
|
|
225
|
+
autoload=True
|
|
226
|
+
)
|
|
227
|
+
cached_sim = cbrkit.sim.embed.build(
|
|
228
|
+
conversion_func=cached_embed_func,
|
|
229
|
+
sim_func=cbrkit.sim.embed.cosine()
|
|
230
|
+
)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
#### Taxonomy-Based Similarity
|
|
159
234
|
|
|
160
235
|
```python
|
|
161
|
-
|
|
236
|
+
# Load taxonomy from file
|
|
237
|
+
taxonomy_sim = cbrkit.sim.taxonomy.build(
|
|
238
|
+
path="taxonomy.yaml",
|
|
239
|
+
measure=cbrkit.sim.taxonomy.wu_palmer(),
|
|
240
|
+
)
|
|
162
241
|
```
|
|
163
242
|
|
|
164
|
-
|
|
243
|
+
#### Utility Functions
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
# Combining multiple similarity functions
|
|
247
|
+
combined_sim = cbrkit.sim.combine(
|
|
248
|
+
sim_funcs=[sim1, sim2, sim3],
|
|
249
|
+
aggregator=cbrkit.sim.aggregator(pooling="mean")
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
# Caching similarity results
|
|
253
|
+
cached_sim = cbrkit.sim.cache(base_sim_func)
|
|
254
|
+
|
|
255
|
+
# Transposing similarity functions
|
|
256
|
+
transposed_sim = cbrkit.sim.transpose(
|
|
257
|
+
sim_func=number_sim,
|
|
258
|
+
to_x=lambda s: float(s),
|
|
259
|
+
to_y=lambda s: float(s)
|
|
260
|
+
)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
**Please note:** Calling these functions returns a similarity function itself that has the signature `sim = f(x, y)`.
|
|
165
264
|
|
|
166
265
|
An overview of all available similarity measures can be found in the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim.html).
|
|
167
266
|
|
|
267
|
+
### Graph Similarity
|
|
268
|
+
|
|
269
|
+
CBRkit provides extensive support for graph similarity through various algorithms:
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
# Using Graph Edit Distance (GED) with A* search
|
|
273
|
+
graph_sim = cbrkit.sim.graphs.astar(
|
|
274
|
+
node_sim=cbrkit.sim.generic.equality(),
|
|
275
|
+
node_matcher=lambda n1, n2: n1 == n2,
|
|
276
|
+
edge_matcher=lambda e1, e2: e1 == e2
|
|
277
|
+
)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
Available graph algorithms include:
|
|
281
|
+
- `astar`: A* search for optimal graph edit distance
|
|
282
|
+
- `vf2`: VF2 algorithm for (sub)graph isomorphism
|
|
283
|
+
- `lap`: Linear assignment problem solver
|
|
284
|
+
- `greedy`: Fast greedy matching
|
|
285
|
+
- `brute_force`: Exhaustive search for small graphs
|
|
286
|
+
- `dfs`: Depth-first search based matching
|
|
287
|
+
|
|
168
288
|
### Global Similarity and Aggregation
|
|
169
289
|
|
|
170
290
|
When dealing with cases that are not represented through elementary data types like strings, we need to aggregate individual measures to obtain a global similarity score.
|
|
@@ -295,9 +415,8 @@ They are provided through **generator functions** that allow you to customize th
|
|
|
295
415
|
For example, a number aggregator can be obtained as follows:
|
|
296
416
|
|
|
297
417
|
```python
|
|
298
|
-
# pooling
|
|
299
|
-
|
|
300
|
-
number_adapter = cbrkit.adapt.numbers.aggregate(pooling)
|
|
418
|
+
# pooling can be a string like "mean", "min", "max", "sum", etc. or a custom PoolingFunction
|
|
419
|
+
number_adapter = cbrkit.adapt.numbers.aggregate(pooling="mean")
|
|
301
420
|
```
|
|
302
421
|
|
|
303
422
|
**Please note:** Calling the function `cbrkit.adapt.numbers.aggregate` returns an adaptation function that takes a collection of values and returns an adapted value.
|
|
@@ -351,6 +470,46 @@ result = cbrkit.reuse.apply_query(retrieval_result, query, (reuser1, reuser2))
|
|
|
351
470
|
|
|
352
471
|
The result structure follows the same pattern as the retrieval results with `final_step` and `steps` attributes.
|
|
353
472
|
|
|
473
|
+
## Advanced Retrieval
|
|
474
|
+
|
|
475
|
+
### BM25 Retrieval
|
|
476
|
+
|
|
477
|
+
CBRkit includes a BM25 retriever for text-based retrieval:
|
|
478
|
+
|
|
479
|
+
```python
|
|
480
|
+
retriever = cbrkit.retrieval.bm25(
|
|
481
|
+
key="text_field", # Field to search in
|
|
482
|
+
limit=10
|
|
483
|
+
)
|
|
484
|
+
result = cbrkit.retrieval.apply_query(casebase, query, retriever)
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
### Combining Multiple Retrievers
|
|
488
|
+
|
|
489
|
+
The `combine` function allows merging results from multiple retrievers:
|
|
490
|
+
|
|
491
|
+
```python
|
|
492
|
+
retriever1 = cbrkit.retrieval.build(...)
|
|
493
|
+
retriever2 = cbrkit.retrieval.bm25(...)
|
|
494
|
+
|
|
495
|
+
combined = cbrkit.retrieval.combine(
|
|
496
|
+
retrievers=[retriever1, retriever2],
|
|
497
|
+
aggregator=cbrkit.sim.aggregator(pooling="mean")
|
|
498
|
+
)
|
|
499
|
+
result = cbrkit.retrieval.apply_query(casebase, query, combined)
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
### Distributed Processing
|
|
503
|
+
|
|
504
|
+
For large-scale retrieval, use the `distribute` wrapper:
|
|
505
|
+
|
|
506
|
+
```python
|
|
507
|
+
retriever = cbrkit.retrieval.distribute(
|
|
508
|
+
cbrkit.retrieval.build(...),
|
|
509
|
+
batch_size=1000
|
|
510
|
+
)
|
|
511
|
+
```
|
|
512
|
+
|
|
354
513
|
## Evaluation
|
|
355
514
|
|
|
356
515
|
CBRkit provides evaluation tools through the `cbrkit.eval` module for assessing the quality of retrieval results.
|
|
@@ -436,7 +595,8 @@ response = cbrkit.synthesis.apply_result(retrieval, synthesizer).response
|
|
|
436
595
|
|
|
437
596
|
### Working with large casebases
|
|
438
597
|
|
|
439
|
-
Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase.
|
|
598
|
+
Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase.
|
|
599
|
+
For this reason, we recommend transposing the cases (e.g., truncating every case to a fixed length) and/or applying chunking.
|
|
440
600
|
|
|
441
601
|
#### Transposing cases
|
|
442
602
|
|
|
@@ -449,7 +609,7 @@ from cbrkit.dumpers import json_markdown
|
|
|
449
609
|
def encoder(value) -> dict:
|
|
450
610
|
...
|
|
451
611
|
baseprompt = cbrkit.synthesis.prompts.default(instructions, encoder=encoder)
|
|
452
|
-
# transform the entries, e.g
|
|
612
|
+
# transform the entries, e.g., by shortening, leaving out irrelevant attributes, etc.
|
|
453
613
|
# In this case, the value of every field is truncated to 100 characters
|
|
454
614
|
def shorten(entry: dict) -> JsonEntry:
|
|
455
615
|
entry = {k: str(v)[:100] for k,v in entry.items()}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "cbrkit"
|
|
3
|
-
version = "0.26.4"
|
|
3
|
+
version = "0.27.0"
|
|
4
4
|
description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI"
|
|
5
5
|
authors = [{ name = "Mirko Lenz", email = "mirko@mirkolenz.com" }]
|
|
6
6
|
readme = "README.md"
|
|
@@ -58,6 +58,7 @@ api = [
|
|
|
58
58
|
"python-multipart>=0.0.15,<1",
|
|
59
59
|
"uvicorn[standard]>=0.30,<1",
|
|
60
60
|
]
|
|
61
|
+
chunking = ["chonkie>=1,<2"]
|
|
61
62
|
cli = ["typer>=0.9,<1"]
|
|
62
63
|
eval = ["ranx>=0.3,<1"]
|
|
63
64
|
graphs = ["networkx>=3,<4", "rustworkx>=0.15,<1"]
|
|
@@ -244,7 +244,7 @@ def kendall_tau(
|
|
|
244
244
|
qrel_relevant = {k for k, v in qrels[key].items() if v >= relevance_level}
|
|
245
245
|
sorted_qrel_relevant = sorted(qrel_relevant, key=lambda x: qrels[key][x])
|
|
246
246
|
|
|
247
|
-
sorted_run = sorted(run.keys(), key=lambda x: run[key][x], reverse=True)
|
|
247
|
+
sorted_run = sorted(run[key].keys(), key=lambda x: run[key][x], reverse=True)
|
|
248
248
|
run_k = sorted_run[: k if k > 0 else len(sorted_run)]
|
|
249
249
|
|
|
250
250
|
max_idx = min(len(run_k), len(sorted_qrel_relevant))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
2
|
from typing import Any, Literal
|
|
3
3
|
|
|
4
|
-
from ..helpers import
|
|
4
|
+
from ..helpers import normalize_and_scale, round, unpack_float
|
|
5
5
|
from ..retrieval import Result, ResultStep
|
|
6
6
|
from ..typing import EvalMetricFunc, Float, QueryCaseMatrix
|
|
7
7
|
from .common import DEFAULT_METRICS, compute
|
|
@@ -65,12 +65,10 @@ def retrieval_step_to_qrels[Q, C, S: Float](
|
|
|
65
65
|
min_sim = 0.0
|
|
66
66
|
max_sim = 1.0
|
|
67
67
|
|
|
68
|
-
qrel_factor = max_qrel - min_qrel
|
|
69
|
-
|
|
70
68
|
return {
|
|
71
69
|
query: {
|
|
72
70
|
case: round(
|
|
73
|
-
|
|
71
|
+
normalize_and_scale(sim, min_sim, max_sim, min_qrel, max_qrel),
|
|
74
72
|
round_mode,
|
|
75
73
|
)
|
|
76
74
|
for case, sim in entry.items()
|
|
@@ -71,6 +71,8 @@ __all__ = [
|
|
|
71
71
|
"load_callables_map",
|
|
72
72
|
"load_callables",
|
|
73
73
|
"load_object",
|
|
74
|
+
"normalize",
|
|
75
|
+
"normalize_and_scale",
|
|
74
76
|
"log_batch",
|
|
75
77
|
"mp_count",
|
|
76
78
|
"mp_map",
|
|
@@ -605,6 +607,30 @@ def scale(value: float, lower: float, upper: float) -> float:
|
|
|
605
607
|
return value * (upper - lower) + lower
|
|
606
608
|
|
|
607
609
|
|
|
610
|
+
def normalize(value: float, value_min: float, value_max: float) -> float:
|
|
611
|
+
"""Normalize a value from [value_min, value_max] to [0, 1]."""
|
|
612
|
+
if value_max == value_min:
|
|
613
|
+
# Handle edge case where all values are identical
|
|
614
|
+
return 0.0
|
|
615
|
+
|
|
616
|
+
return (value - value_min) / (value_max - value_min)
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
def normalize_and_scale(
|
|
620
|
+
value: float,
|
|
621
|
+
value_min: float,
|
|
622
|
+
value_max: float,
|
|
623
|
+
target_min: float,
|
|
624
|
+
target_max: float,
|
|
625
|
+
) -> float:
|
|
626
|
+
"""Normalize a value from [value_min, value_max] to [target_min, target_max]."""
|
|
627
|
+
# First normalize to [0, 1]
|
|
628
|
+
normalized = normalize(value, value_min, value_max)
|
|
629
|
+
|
|
630
|
+
# Then scale to target range
|
|
631
|
+
return scale(normalized, target_min, target_max)
|
|
632
|
+
|
|
633
|
+
|
|
608
634
|
def load_object(import_name: str) -> Any:
|
|
609
635
|
"""Import an object based on a string.
|
|
610
636
|
|
|
@@ -3,6 +3,9 @@ from ..model import QueryResultStep, Result, ResultStep
|
|
|
3
3
|
from .apply import apply_batches, apply_queries, apply_query
|
|
4
4
|
from .build import build, combine, distribute, dropout, transpose, transpose_value
|
|
5
5
|
|
|
6
|
+
with optional_dependencies():
|
|
7
|
+
from .build import chunk
|
|
8
|
+
|
|
6
9
|
with optional_dependencies():
|
|
7
10
|
from .rerank import cohere
|
|
8
11
|
|
|
@@ -22,6 +25,7 @@ __all__ = [
|
|
|
22
25
|
"dropout",
|
|
23
26
|
"distribute",
|
|
24
27
|
"combine",
|
|
28
|
+
"chunk",
|
|
25
29
|
"apply_batches",
|
|
26
30
|
"apply_queries",
|
|
27
31
|
"apply_query",
|
|
@@ -12,6 +12,7 @@ from ..helpers import (
|
|
|
12
12
|
mp_count,
|
|
13
13
|
mp_map,
|
|
14
14
|
mp_starmap,
|
|
15
|
+
optional_dependencies,
|
|
15
16
|
sim_map2ranking,
|
|
16
17
|
unpack_float,
|
|
17
18
|
use_mp,
|
|
@@ -315,3 +316,47 @@ class build[K, V, S: Float](RetrieverFunc[K, V, S]):
|
|
|
315
316
|
similarities[idx][key] = sim
|
|
316
317
|
|
|
317
318
|
return similarities
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
with optional_dependencies():
|
|
322
|
+
from chonkie import BaseChunker
|
|
323
|
+
|
|
324
|
+
@dataclass(slots=True, frozen=True)
|
|
325
|
+
class chunk[S: Float](RetrieverFunc[str, str, S]):
|
|
326
|
+
"""Chunks string cases using the chonkie library before retrieval.
|
|
327
|
+
|
|
328
|
+
This retriever is special in that it returns a different set of cases for each batch
|
|
329
|
+
it processes, as it splits the original string cases into chunks.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
retriever_func: The retriever function to be used on the chunked strings.
|
|
333
|
+
chunker: A BaseChunker instance from the chonkie library.
|
|
334
|
+
|
|
335
|
+
Returns:
|
|
336
|
+
A retriever function that chunks string cases and retrieves from the chunks.
|
|
337
|
+
"""
|
|
338
|
+
|
|
339
|
+
retriever_func: RetrieverFunc[str, str, S]
|
|
340
|
+
chunker: BaseChunker
|
|
341
|
+
|
|
342
|
+
@override
|
|
343
|
+
def __call__(
|
|
344
|
+
self, batches: Sequence[tuple[Casebase[str, str], str]]
|
|
345
|
+
) -> Sequence[SimMap[str, S]]:
|
|
346
|
+
chunked_batches: list[tuple[Casebase[str, str], str]] = []
|
|
347
|
+
|
|
348
|
+
for casebase, query in batches:
|
|
349
|
+
chunked_casebase: dict[str, str] = {}
|
|
350
|
+
|
|
351
|
+
for case_key, case_text in casebase.items():
|
|
352
|
+
chunks = self.chunker.chunk(case_text)
|
|
353
|
+
|
|
354
|
+
for i, chunk in enumerate(chunks):
|
|
355
|
+
chunk_key = f"{case_key}-chunk{i}"
|
|
356
|
+
chunked_casebase[chunk_key] = (
|
|
357
|
+
chunk if isinstance(chunk, str) else chunk.text
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
chunked_batches.append((chunked_casebase, query))
|
|
361
|
+
|
|
362
|
+
return self.retriever_func(chunked_batches)
|
|
@@ -9,8 +9,9 @@ there is also a measure for attribute-value data.
|
|
|
9
9
|
Additionally, the module contains an aggregator to combine multiple local measures into a global score.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
-
from . import collections, embed, generic, graphs, numbers, strings, taxonomy
|
|
13
|
-
from .aggregator import
|
|
12
|
+
from . import collections, embed, generic, graphs, numbers, pooling, strings, taxonomy
|
|
13
|
+
from .aggregator import aggregator
|
|
14
|
+
from .pooling import PoolingName
|
|
14
15
|
from .attribute_value import AttributeValueSim, attribute_value
|
|
15
16
|
from .wrappers import (
|
|
16
17
|
attribute_table,
|
|
@@ -40,6 +41,7 @@ __all__ = [
|
|
|
40
41
|
"graphs",
|
|
41
42
|
"embed",
|
|
42
43
|
"taxonomy",
|
|
44
|
+
"pooling",
|
|
43
45
|
"aggregator",
|
|
44
46
|
"PoolingName",
|
|
45
47
|
"AttributeValueSim",
|