cbrkit 0.26.4__tar.gz → 0.27.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {cbrkit-0.26.4 → cbrkit-0.27.0}/PKG-INFO +175 -13
  2. {cbrkit-0.26.4 → cbrkit-0.27.0}/README.md +172 -12
  3. {cbrkit-0.26.4 → cbrkit-0.27.0}/pyproject.toml +2 -1
  4. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/eval/common.py +1 -1
  5. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/eval/retrieval.py +2 -4
  6. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/helpers.py +26 -0
  7. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/__init__.py +4 -0
  8. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/build.py +45 -0
  9. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/__init__.py +4 -2
  10. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/aggregator.py +3 -34
  11. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/astar.py +8 -44
  12. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/common.py +42 -5
  13. cbrkit-0.27.0/src/cbrkit/sim/pooling.py +100 -0
  14. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/strings.py +80 -25
  15. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/typing.py +40 -0
  16. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/__init__.py +0 -0
  17. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/__main__.py +0 -0
  18. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/__init__.py +0 -0
  19. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/attribute_value.py +0 -0
  20. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/generic.py +0 -0
  21. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/numbers.py +0 -0
  22. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/adapt/strings.py +0 -0
  23. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/api.py +0 -0
  24. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/cli.py +0 -0
  25. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/constants.py +0 -0
  26. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/cycle.py +0 -0
  27. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/dumpers.py +0 -0
  28. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/eval/__init__.py +0 -0
  29. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/loaders.py +0 -0
  30. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/model/__init__.py +0 -0
  31. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/model/graph.py +0 -0
  32. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/model/result.py +0 -0
  33. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/py.typed +0 -0
  34. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/apply.py +0 -0
  35. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/retrieval/rerank.py +0 -0
  36. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/reuse/__init__.py +0 -0
  37. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/reuse/apply.py +0 -0
  38. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/reuse/build.py +0 -0
  39. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/attribute_value.py +0 -0
  40. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/collections.py +0 -0
  41. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/embed.py +0 -0
  42. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/generic.py +0 -0
  43. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/__init__.py +0 -0
  44. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/alignment.py +0 -0
  45. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/brute_force.py +0 -0
  46. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/dfs.py +0 -0
  47. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/greedy.py +0 -0
  48. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/lap.py +0 -0
  49. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/precompute.py +0 -0
  50. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/qap.py +0 -0
  51. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/graphs/vf2.py +0 -0
  52. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/numbers.py +0 -0
  53. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/taxonomy.py +0 -0
  54. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/sim/wrappers.py +0 -0
  55. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/__init__.py +0 -0
  56. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/apply.py +0 -0
  57. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/build.py +0 -0
  58. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/model.py +0 -0
  59. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/prompts.py +0 -0
  60. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/__init__.py +0 -0
  61. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/anthropic.py +0 -0
  62. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/cohere.py +0 -0
  63. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/google.py +0 -0
  64. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/instructor.py +0 -0
  65. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/model.py +0 -0
  66. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/ollama.py +0 -0
  67. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/openai.py +0 -0
  68. {cbrkit-0.26.4 → cbrkit-0.27.0}/src/cbrkit/synthesis/providers/wrappers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: cbrkit
3
- Version: 0.26.4
3
+ Version: 0.27.0
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
5
5
  Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
6
6
  Author: Mirko Lenz
@@ -39,6 +39,7 @@ Requires-Dist: fastapi>=0.100,<1 ; extra == 'api'
39
39
  Requires-Dist: pydantic-settings>=2,<3 ; extra == 'api'
40
40
  Requires-Dist: python-multipart>=0.0.15,<1 ; extra == 'api'
41
41
  Requires-Dist: uvicorn[standard]>=0.30,<1 ; extra == 'api'
42
+ Requires-Dist: chonkie>=1,<2 ; extra == 'chunking'
42
43
  Requires-Dist: typer>=0.9,<1 ; extra == 'cli'
43
44
  Requires-Dist: ranx>=0.3,<1 ; extra == 'eval'
44
45
  Requires-Dist: networkx>=3,<4 ; extra == 'graphs'
@@ -70,6 +71,7 @@ Project-URL: Issues, https://github.com/wi2trier/cbrkit/issues
70
71
  Project-URL: Repository, https://github.com/wi2trier/cbrkit
71
72
  Provides-Extra: all
72
73
  Provides-Extra: api
74
+ Provides-Extra: chunking
73
75
  Provides-Extra: cli
74
76
  Provides-Extra: eval
75
77
  Provides-Extra: graphs
@@ -119,12 +121,25 @@ Further examples can be found in our [tests](./tests/test_retrieve.py) and [docu
119
121
  The following modules are part of CBRkit:
120
122
 
121
123
  - `cbrkit.loaders` and `cbrkit.dumpers`: Functions for loading and exporting cases and queries.
122
- - `cbrkit.sim`: Similarity generator functions for common data types like strings and numbers.
123
- - `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines.
124
+ - `cbrkit.sim`: Similarity functions for common data types and some utility functions such as `cache`, `combine`, `transpose`, etc.
125
+ - `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, semantic, etc.).
126
+ - `cbrkit.sim.numbers`: Numeric similarity measures (linear, exponential, threshold).
127
+ - `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, DTW, Smith-Waterman).
128
+ - `cbrkit.sim.embed`: Embedding-based similarity functions with caching support.
129
+ - `cbrkit.sim.graphs`: Graph similarity algorithms including GED, A*, VF2, and more.
130
+ - `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions.
131
+ - `cbrkit.sim.generic`: Generic similarity functions (equality, tables, static).
132
+ - `cbrkit.sim.attribute_value`: Similarity for attribute-value based data.
133
+ - `cbrkit.sim.pooling`: Functions for aggregating multiple similarity values.
134
+ - `cbrkit.sim.aggregator`: Combines multiple local measures into global scores.
135
+ - `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines, including BM25 retrieval, rerankers, etc.
124
136
  - `cbrkit.adapt`: Adaptation generator functions for adapting cases based on a query.
125
137
  - `cbrkit.reuse`: Functions for defining and applying reuse pipelines.
138
+ - `cbrkit.eval`: Evaluation metrics for retrieval results including precision, recall, and custom metrics.
139
+ - `cbrkit.model`: Data models for graphs and results.
140
+ - `cbrkit.cycle`: CBR cycle implementation.
126
141
  - `cbrkit.typing`: Generic type definitions for defining custom functions.
127
- - `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g. in a RAG context.
142
+ - `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g., in a RAG context.
128
143
 
129
144
  ## Installation
130
145
 
@@ -235,18 +250,125 @@ You need to make sure that the two parameters are named `x` and `y`, otherwise C
235
250
 
236
251
  ### Built-in Similarity Measures
237
252
 
238
- CBRkit also contains a selection of built-in similarity measures for the most common data types in the module `cbrkit.sim`.
253
+ CBRkit contains a comprehensive selection of built-in similarity measures for various data types in the module `cbrkit.sim`.
239
254
  They are provided through **generator functions** that allow you to customize the behavior of the built-in measures.
240
- For example, an spacy-based embedding similarity measure can be obtained as follows:
255
+
256
+ #### String Similarity
257
+
258
+ ```python
259
+ # Semantic similarity is covered by the `cbrkit.sim.embed` module.
260
+ # See below for details.
261
+
262
+ # Edit distance measures
263
+ levenshtein_sim = cbrkit.sim.strings.levenshtein()
264
+ jaro_sim = cbrkit.sim.strings.jaro()
265
+
266
+ # Exact matching
267
+ equality_sim = cbrkit.sim.generic.equality()
268
+ ```
269
+
270
+ #### Number Similarity
271
+
272
+ ```python
273
+ # Linear similarity with optional thresholds
274
+ linear_sim = cbrkit.sim.numbers.linear(max_distance=100)
275
+
276
+ # Exponential decay similarity
277
+ exp_sim = cbrkit.sim.numbers.exponential(alpha=0.1)
278
+
279
+ # Step functions
280
+ threshold_sim = cbrkit.sim.numbers.threshold(threshold=50)
281
+ ```
282
+
283
+ #### Embedding-Based Similarity
284
+
285
+ ```python
286
+ # Build a similarity function with embedding and scorer
287
+ embed_sim = cbrkit.sim.embed.build(
288
+ conversion_func=cbrkit.sim.embed.sentence_transformers(
289
+ model="all-MiniLM-L6-v2"
290
+ ),
291
+ sim_func=cbrkit.sim.embed.cosine() # or dot(), angular(), euclidean(), manhattan()
292
+ )
293
+
294
+ # Using OpenAI embeddings
295
+ openai_sim = cbrkit.sim.embed.build(
296
+ conversion_func=cbrkit.sim.embed.openai(
297
+ model="text-embedding-3-small"
298
+ ),
299
+ sim_func=cbrkit.sim.embed.cosine()
300
+ )
301
+
302
+ # Caching embeddings for performance
303
+ cached_embed_func = cbrkit.sim.embed.cache(
304
+ func=cbrkit.sim.embed.sentence_transformers(
305
+ model="all-MiniLM-L6-v2"
306
+ ),
307
+ path="embeddings_cache.npz",
308
+ autodump=True,
309
+ autoload=True
310
+ )
311
+ cached_sim = cbrkit.sim.embed.build(
312
+ conversion_func=cached_embed_func,
313
+ sim_func=cbrkit.sim.embed.cosine()
314
+ )
315
+ ```
316
+
317
+ #### Taxonomy-Based Similarity
241
318
 
242
319
  ```python
243
- semantic_similarity = cbrkit.sim.strings.spacy(model="en_core_web_lg")
320
+ # Load taxonomy from file
321
+ taxonomy_sim = cbrkit.sim.taxonomy.build(
322
+ path="taxonomy.yaml",
323
+ measure=cbrkit.sim.taxonomy.wu_palmer(),
324
+ )
244
325
  ```
245
326
 
246
- **Please note:** Calling the function `cbrkit.sim.strings.spacy` returns a similarity function itself that has the same signature as the `color_similarity` function defined above.
327
+ #### Utility Functions
328
+
329
+ ```python
330
+ # Combining multiple similarity functions
331
+ combined_sim = cbrkit.sim.combine(
332
+ sim_funcs=[sim1, sim2, sim3],
333
+ aggregator=cbrkit.sim.aggregator(pooling="mean")
334
+ )
335
+
336
+ # Caching similarity results
337
+ cached_sim = cbrkit.sim.cache(base_sim_func)
338
+
339
+ # Transposing similarity functions
340
+ transposed_sim = cbrkit.sim.transpose(
341
+ sim_func=number_sim,
342
+ to_x=lambda s: float(s),
343
+ to_y=lambda s: float(s)
344
+ )
345
+ ```
346
+
347
+ **Please note:** Calling these functions returns a similarity function itself that has the signature `sim = f(x, y)`.
247
348
 
248
349
  An overview of all available similarity measures can be found in the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim.html).
249
350
 
351
+ ### Graph Similarity
352
+
353
+ CBRkit provides extensive support for graph similarity through various algorithms:
354
+
355
+ ```python
356
+ # Using Graph Edit Distance (GED) with A* search
357
+ graph_sim = cbrkit.sim.graphs.astar(
358
+ node_sim=cbrkit.sim.generic.equality(),
359
+ node_matcher=lambda n1, n2: n1 == n2,
360
+ edge_matcher=lambda e1, e2: e1 == e2
361
+ )
362
+ ```
363
+
364
+ Available graph algorithms include:
365
+ - `astar`: A* search for optimal graph edit distance
366
+ - `vf2`: VF2 algorithm for (sub)graph isomorphism
367
+ - `lap`: Linear assignment problem solver
368
+ - `greedy`: Fast greedy matching
369
+ - `brute_force`: Exhaustive search for small graphs
370
+ - `dfs`: Depth-first search based matching
371
+
250
372
  ### Global Similarity and Aggregation
251
373
 
252
374
  When dealing with cases that are not represented through elementary data types like strings, we need to aggregate individual measures to obtain a global similarity score.
@@ -377,9 +499,8 @@ They are provided through **generator functions** that allow you to customize th
377
499
  For example, a number aggregator can be obtained as follows:
378
500
 
379
501
  ```python
380
- # pooling must be a PoolingFunction or one of the provided PoolingNames
381
- pooling = "mean"
382
- number_adapter = cbrkit.adapt.numbers.aggregate(pooling)
502
+ # pooling can be a string like "mean", "min", "max", "sum", etc. or a custom PoolingFunction
503
+ number_adapter = cbrkit.adapt.numbers.aggregate(pooling="mean")
383
504
  ```
384
505
 
385
506
  **Please note:** Calling the function `cbrkit.adapt.numbers.aggregate` returns an adaptation function that takes a collection of values and returns an adapted value.
@@ -433,6 +554,46 @@ result = cbrkit.reuse.apply_query(retrieval_result, query, (reuser1, reuser2))
433
554
 
434
555
  The result structure follows the same pattern as the retrieval results with `final_step` and `steps` attributes.
435
556
 
557
+ ## Advanced Retrieval
558
+
559
+ ### BM25 Retrieval
560
+
561
+ CBRkit includes a BM25 retriever for text-based retrieval:
562
+
563
+ ```python
564
+ retriever = cbrkit.retrieval.bm25(
565
+ key="text_field", # Field to search in
566
+ limit=10
567
+ )
568
+ result = cbrkit.retrieval.apply_query(casebase, query, retriever)
569
+ ```
570
+
571
+ ### Combining Multiple Retrievers
572
+
573
+ The `combine` function allows merging results from multiple retrievers:
574
+
575
+ ```python
576
+ retriever1 = cbrkit.retrieval.build(...)
577
+ retriever2 = cbrkit.retrieval.bm25(...)
578
+
579
+ combined = cbrkit.retrieval.combine(
580
+ retrievers=[retriever1, retriever2],
581
+ aggregator=cbrkit.sim.aggregator(pooling="mean")
582
+ )
583
+ result = cbrkit.retrieval.apply_query(casebase, query, combined)
584
+ ```
585
+
586
+ ### Distributed Processing
587
+
588
+ For large-scale retrieval, use the `distribute` wrapper:
589
+
590
+ ```python
591
+ retriever = cbrkit.retrieval.distribute(
592
+ cbrkit.retrieval.build(...),
593
+ batch_size=1000
594
+ )
595
+ ```
596
+
436
597
  ## Evaluation
437
598
 
438
599
  CBRkit provides evaluation tools through the `cbrkit.eval` module for assessing the quality of retrieval results.
@@ -518,7 +679,8 @@ response = cbrkit.synthesis.apply_result(retrieval, synthesizer).response
518
679
 
519
680
  ### Working with large casebases
520
681
 
521
- Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase. Because of this, in this case, we recommend transposing the cases (e.g. truncate every case to a fixed length) and/or apply chunking.
682
+ Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase.
683
+ For this reason, we recommend transposing the cases (e.g., truncating every case to a fixed length) and/or applying chunking.
522
684
 
523
685
  #### Transposing cases
524
686
 
@@ -531,7 +693,7 @@ from cbrkit.dumpers import json_markdown
531
693
  def encoder(value) -> dict:
532
694
  ...
533
695
  baseprompt = cbrkit.synthesis.prompts.default(instructions, encoder=encoder)
534
- # transform the entries, e.g. by shortening, leaving out irrelevant attributes, etc.
696
+ # transform the entries, e.g., by shortening, leaving out irrelevant attributes, etc.
535
697
  # In this case, the value of every field is truncated to 100 characters
536
698
  def shorten(entry: dict) -> JsonEntry:
537
699
  entry = {k: str(v)[:100] for k,v in entry.items()}
@@ -37,12 +37,25 @@ Further examples can be found in our [tests](./tests/test_retrieve.py) and [docu
37
37
  The following modules are part of CBRkit:
38
38
 
39
39
  - `cbrkit.loaders` and `cbrkit.dumpers`: Functions for loading and exporting cases and queries.
40
- - `cbrkit.sim`: Similarity generator functions for common data types like strings and numbers.
41
- - `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines.
40
+ - `cbrkit.sim`: Similarity functions for common data types and some utility functions such as `cache`, `combine`, `transpose`, etc.
41
+ - `cbrkit.sim.strings`: String similarity measures (Levenshtein, Jaro, semantic, etc.).
42
+ - `cbrkit.sim.numbers`: Numeric similarity measures (linear, exponential, threshold).
43
+ - `cbrkit.sim.collections`: Similarity measures for collections and sequences (Jaccard, DTW, Smith-Waterman).
44
+ - `cbrkit.sim.embed`: Embedding-based similarity functions with caching support.
45
+ - `cbrkit.sim.graphs`: Graph similarity algorithms including GED, A*, VF2, and more.
46
+ - `cbrkit.sim.taxonomy`: Taxonomy-based similarity functions.
47
+ - `cbrkit.sim.generic`: Generic similarity functions (equality, tables, static).
48
+ - `cbrkit.sim.attribute_value`: Similarity for attribute-value based data.
49
+ - `cbrkit.sim.pooling`: Functions for aggregating multiple similarity values.
50
+ - `cbrkit.sim.aggregator`: Combines multiple local measures into global scores.
51
+ - `cbrkit.retrieval`: Functions for defining and applying retrieval pipelines, including BM25 retrieval, rerankers, etc.
42
52
  - `cbrkit.adapt`: Adaptation generator functions for adapting cases based on a query.
43
53
  - `cbrkit.reuse`: Functions for defining and applying reuse pipelines.
54
+ - `cbrkit.eval`: Evaluation metrics for retrieval results including precision, recall, and custom metrics.
55
+ - `cbrkit.model`: Data models for graphs and results.
56
+ - `cbrkit.cycle`: CBR cycle implementation.
44
57
  - `cbrkit.typing`: Generic type definitions for defining custom functions.
45
- - `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g. in a RAG context.
58
+ - `cbrkit.synthesis`: Functions for working on a casebase with LLMs to create new insights, e.g., in a RAG context.
46
59
 
47
60
  ## Installation
48
61
 
@@ -153,18 +166,125 @@ You need to make sure that the two parameters are named `x` and `y`, otherwise C
153
166
 
154
167
  ### Built-in Similarity Measures
155
168
 
156
- CBRkit also contains a selection of built-in similarity measures for the most common data types in the module `cbrkit.sim`.
169
+ CBRkit contains a comprehensive selection of built-in similarity measures for various data types in the module `cbrkit.sim`.
157
170
  They are provided through **generator functions** that allow you to customize the behavior of the built-in measures.
158
- For example, an spacy-based embedding similarity measure can be obtained as follows:
171
+
172
+ #### String Similarity
173
+
174
+ ```python
175
+ # Semantic similarity is covered by the `cbrkit.sim.embed` module.
176
+ # See below for details.
177
+
178
+ # Edit distance measures
179
+ levenshtein_sim = cbrkit.sim.strings.levenshtein()
180
+ jaro_sim = cbrkit.sim.strings.jaro()
181
+
182
+ # Exact matching
183
+ equality_sim = cbrkit.sim.generic.equality()
184
+ ```
185
+
186
+ #### Number Similarity
187
+
188
+ ```python
189
+ # Linear similarity with optional thresholds
190
+ linear_sim = cbrkit.sim.numbers.linear(max_distance=100)
191
+
192
+ # Exponential decay similarity
193
+ exp_sim = cbrkit.sim.numbers.exponential(alpha=0.1)
194
+
195
+ # Step functions
196
+ threshold_sim = cbrkit.sim.numbers.threshold(threshold=50)
197
+ ```
198
+
199
+ #### Embedding-Based Similarity
200
+
201
+ ```python
202
+ # Build a similarity function with embedding and scorer
203
+ embed_sim = cbrkit.sim.embed.build(
204
+ conversion_func=cbrkit.sim.embed.sentence_transformers(
205
+ model="all-MiniLM-L6-v2"
206
+ ),
207
+ sim_func=cbrkit.sim.embed.cosine() # or dot(), angular(), euclidean(), manhattan()
208
+ )
209
+
210
+ # Using OpenAI embeddings
211
+ openai_sim = cbrkit.sim.embed.build(
212
+ conversion_func=cbrkit.sim.embed.openai(
213
+ model="text-embedding-3-small"
214
+ ),
215
+ sim_func=cbrkit.sim.embed.cosine()
216
+ )
217
+
218
+ # Caching embeddings for performance
219
+ cached_embed_func = cbrkit.sim.embed.cache(
220
+ func=cbrkit.sim.embed.sentence_transformers(
221
+ model="all-MiniLM-L6-v2"
222
+ ),
223
+ path="embeddings_cache.npz",
224
+ autodump=True,
225
+ autoload=True
226
+ )
227
+ cached_sim = cbrkit.sim.embed.build(
228
+ conversion_func=cached_embed_func,
229
+ sim_func=cbrkit.sim.embed.cosine()
230
+ )
231
+ ```
232
+
233
+ #### Taxonomy-Based Similarity
159
234
 
160
235
  ```python
161
- semantic_similarity = cbrkit.sim.strings.spacy(model="en_core_web_lg")
236
+ # Load taxonomy from file
237
+ taxonomy_sim = cbrkit.sim.taxonomy.build(
238
+ path="taxonomy.yaml",
239
+ measure=cbrkit.sim.taxonomy.wu_palmer(),
240
+ )
162
241
  ```
163
242
 
164
- **Please note:** Calling the function `cbrkit.sim.strings.spacy` returns a similarity function itself that has the same signature as the `color_similarity` function defined above.
243
+ #### Utility Functions
244
+
245
+ ```python
246
+ # Combining multiple similarity functions
247
+ combined_sim = cbrkit.sim.combine(
248
+ sim_funcs=[sim1, sim2, sim3],
249
+ aggregator=cbrkit.sim.aggregator(pooling="mean")
250
+ )
251
+
252
+ # Caching similarity results
253
+ cached_sim = cbrkit.sim.cache(base_sim_func)
254
+
255
+ # Transposing similarity functions
256
+ transposed_sim = cbrkit.sim.transpose(
257
+ sim_func=number_sim,
258
+ to_x=lambda s: float(s),
259
+ to_y=lambda s: float(s)
260
+ )
261
+ ```
262
+
263
+ **Please note:** Calling these functions returns a similarity function itself that has the signature `sim = f(x, y)`.
165
264
 
166
265
  An overview of all available similarity measures can be found in the [module documentation](https://wi2trier.github.io/cbrkit/cbrkit/sim.html).
167
266
 
267
+ ### Graph Similarity
268
+
269
+ CBRkit provides extensive support for graph similarity through various algorithms:
270
+
271
+ ```python
272
+ # Using Graph Edit Distance (GED) with A* search
273
+ graph_sim = cbrkit.sim.graphs.astar(
274
+ node_sim=cbrkit.sim.generic.equality(),
275
+ node_matcher=lambda n1, n2: n1 == n2,
276
+ edge_matcher=lambda e1, e2: e1 == e2
277
+ )
278
+ ```
279
+
280
+ Available graph algorithms include:
281
+ - `astar`: A* search for optimal graph edit distance
282
+ - `vf2`: VF2 algorithm for (sub)graph isomorphism
283
+ - `lap`: Linear assignment problem solver
284
+ - `greedy`: Fast greedy matching
285
+ - `brute_force`: Exhaustive search for small graphs
286
+ - `dfs`: Depth-first search based matching
287
+
168
288
  ### Global Similarity and Aggregation
169
289
 
170
290
  When dealing with cases that are not represented through elementary data types like strings, we need to aggregate individual measures to obtain a global similarity score.
@@ -295,9 +415,8 @@ They are provided through **generator functions** that allow you to customize th
295
415
  For example, a number aggregator can be obtained as follows:
296
416
 
297
417
  ```python
298
- # pooling must be a PoolingFunction or one of the provided PoolingNames
299
- pooling = "mean"
300
- number_adapter = cbrkit.adapt.numbers.aggregate(pooling)
418
+ # pooling can be a string like "mean", "min", "max", "sum", etc. or a custom PoolingFunction
419
+ number_adapter = cbrkit.adapt.numbers.aggregate(pooling="mean")
301
420
  ```
302
421
 
303
422
  **Please note:** Calling the function `cbrkit.adapt.numbers.aggregate` returns an adaptation function that takes a collection of values and returns an adapted value.
@@ -351,6 +470,46 @@ result = cbrkit.reuse.apply_query(retrieval_result, query, (reuser1, reuser2))
351
470
 
352
471
  The result structure follows the same pattern as the retrieval results with `final_step` and `steps` attributes.
353
472
 
473
+ ## Advanced Retrieval
474
+
475
+ ### BM25 Retrieval
476
+
477
+ CBRkit includes a BM25 retriever for text-based retrieval:
478
+
479
+ ```python
480
+ retriever = cbrkit.retrieval.bm25(
481
+ key="text_field", # Field to search in
482
+ limit=10
483
+ )
484
+ result = cbrkit.retrieval.apply_query(casebase, query, retriever)
485
+ ```
486
+
487
+ ### Combining Multiple Retrievers
488
+
489
+ The `combine` function allows merging results from multiple retrievers:
490
+
491
+ ```python
492
+ retriever1 = cbrkit.retrieval.build(...)
493
+ retriever2 = cbrkit.retrieval.bm25(...)
494
+
495
+ combined = cbrkit.retrieval.combine(
496
+ retrievers=[retriever1, retriever2],
497
+ aggregator=cbrkit.sim.aggregator(pooling="mean")
498
+ )
499
+ result = cbrkit.retrieval.apply_query(casebase, query, combined)
500
+ ```
501
+
502
+ ### Distributed Processing
503
+
504
+ For large-scale retrieval, use the `distribute` wrapper:
505
+
506
+ ```python
507
+ retriever = cbrkit.retrieval.distribute(
508
+ cbrkit.retrieval.build(...),
509
+ batch_size=1000
510
+ )
511
+ ```
512
+
354
513
  ## Evaluation
355
514
 
356
515
  CBRkit provides evaluation tools through the `cbrkit.eval` module for assessing the quality of retrieval results.
@@ -436,7 +595,8 @@ response = cbrkit.synthesis.apply_result(retrieval, synthesizer).response
436
595
 
437
596
  ### Working with large casebases
438
597
 
439
- Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase. Because of this, in this case, we recommend transposing the cases (e.g. truncate every case to a fixed length) and/or apply chunking.
598
+ Because the built-in `default` and `document_aware` prompt functions include the entire casebase as context, the LLM input can be quite long when working with a large casebase.
599
+ For this reason, we recommend transposing the cases (e.g., truncating every case to a fixed length) and/or applying chunking.
440
600
 
441
601
  #### Transposing cases
442
602
 
@@ -449,7 +609,7 @@ from cbrkit.dumpers import json_markdown
449
609
  def encoder(value) -> dict:
450
610
  ...
451
611
  baseprompt = cbrkit.synthesis.prompts.default(instructions, encoder=encoder)
452
- # transform the entries, e.g. by shortening, leaving out irrelevant attributes, etc.
612
+ # transform the entries, e.g., by shortening, leaving out irrelevant attributes, etc.
453
613
  # In this case, the value of every field is truncated to 100 characters
454
614
  def shorten(entry: dict) -> JsonEntry:
455
615
  entry = {k: str(v)[:100] for k,v in entry.items()}
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "cbrkit"
3
- version = "0.26.4"
3
+ version = "0.27.0"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI"
5
5
  authors = [{ name = "Mirko Lenz", email = "mirko@mirkolenz.com" }]
6
6
  readme = "README.md"
@@ -58,6 +58,7 @@ api = [
58
58
  "python-multipart>=0.0.15,<1",
59
59
  "uvicorn[standard]>=0.30,<1",
60
60
  ]
61
+ chunking = ["chonkie>=1,<2"]
61
62
  cli = ["typer>=0.9,<1"]
62
63
  eval = ["ranx>=0.3,<1"]
63
64
  graphs = ["networkx>=3,<4", "rustworkx>=0.15,<1"]
@@ -244,7 +244,7 @@ def kendall_tau(
244
244
  qrel_relevant = {k for k, v in qrels[key].items() if v >= relevance_level}
245
245
  sorted_qrel_relevant = sorted(qrel_relevant, key=lambda x: qrels[key][x])
246
246
 
247
- sorted_run = sorted(run.keys(), key=lambda x: run[key][x], reverse=True)
247
+ sorted_run = sorted(run[key].keys(), key=lambda x: run[key][x], reverse=True)
248
248
  run_k = sorted_run[: k if k > 0 else len(sorted_run)]
249
249
 
250
250
  max_idx = min(len(run_k), len(sorted_qrel_relevant))
@@ -1,7 +1,7 @@
1
1
  from collections.abc import Sequence
2
2
  from typing import Any, Literal
3
3
 
4
- from ..helpers import round, scale, unpack_float
4
+ from ..helpers import normalize_and_scale, round, unpack_float
5
5
  from ..retrieval import Result, ResultStep
6
6
  from ..typing import EvalMetricFunc, Float, QueryCaseMatrix
7
7
  from .common import DEFAULT_METRICS, compute
@@ -65,12 +65,10 @@ def retrieval_step_to_qrels[Q, C, S: Float](
65
65
  min_sim = 0.0
66
66
  max_sim = 1.0
67
67
 
68
- qrel_factor = max_qrel - min_qrel
69
-
70
68
  return {
71
69
  query: {
72
70
  case: round(
73
- scale(sim, min_sim, max_sim) * qrel_factor + min_qrel,
71
+ normalize_and_scale(sim, min_sim, max_sim, min_qrel, max_qrel),
74
72
  round_mode,
75
73
  )
76
74
  for case, sim in entry.items()
@@ -71,6 +71,8 @@ __all__ = [
71
71
  "load_callables_map",
72
72
  "load_callables",
73
73
  "load_object",
74
+ "normalize",
75
+ "normalize_and_scale",
74
76
  "log_batch",
75
77
  "mp_count",
76
78
  "mp_map",
@@ -605,6 +607,30 @@ def scale(value: float, lower: float, upper: float) -> float:
605
607
  return value * (upper - lower) + lower
606
608
 
607
609
 
610
+ def normalize(value: float, value_min: float, value_max: float) -> float:
611
+ """Normalize a value from [value_min, value_max] to [0, 1]."""
612
+ if value_max == value_min:
613
+ # Handle edge case where all values are identical
614
+ return 0.0
615
+
616
+ return (value - value_min) / (value_max - value_min)
617
+
618
+
619
+ def normalize_and_scale(
620
+ value: float,
621
+ value_min: float,
622
+ value_max: float,
623
+ target_min: float,
624
+ target_max: float,
625
+ ) -> float:
626
+ """Normalize a value from [value_min, value_max] to [target_min, target_max]."""
627
+ # First normalize to [0, 1]
628
+ normalized = normalize(value, value_min, value_max)
629
+
630
+ # Then scale to target range
631
+ return scale(normalized, target_min, target_max)
632
+
633
+
608
634
  def load_object(import_name: str) -> Any:
609
635
  """Import an object based on a string.
610
636
 
@@ -3,6 +3,9 @@ from ..model import QueryResultStep, Result, ResultStep
3
3
  from .apply import apply_batches, apply_queries, apply_query
4
4
  from .build import build, combine, distribute, dropout, transpose, transpose_value
5
5
 
6
+ with optional_dependencies():
7
+ from .build import chunk
8
+
6
9
  with optional_dependencies():
7
10
  from .rerank import cohere
8
11
 
@@ -22,6 +25,7 @@ __all__ = [
22
25
  "dropout",
23
26
  "distribute",
24
27
  "combine",
28
+ "chunk",
25
29
  "apply_batches",
26
30
  "apply_queries",
27
31
  "apply_query",
@@ -12,6 +12,7 @@ from ..helpers import (
12
12
  mp_count,
13
13
  mp_map,
14
14
  mp_starmap,
15
+ optional_dependencies,
15
16
  sim_map2ranking,
16
17
  unpack_float,
17
18
  use_mp,
@@ -315,3 +316,47 @@ class build[K, V, S: Float](RetrieverFunc[K, V, S]):
315
316
  similarities[idx][key] = sim
316
317
 
317
318
  return similarities
319
+
320
+
321
+ with optional_dependencies():
322
+ from chonkie import BaseChunker
323
+
324
+ @dataclass(slots=True, frozen=True)
325
+ class chunk[S: Float](RetrieverFunc[str, str, S]):
326
+ """Chunks string cases using the chonkie library before retrieval.
327
+
328
+ This retriever is special in that it returns a different set of cases for each batch
329
+ it processes, as it splits the original string cases into chunks.
330
+
331
+ Args:
332
+ retriever_func: The retriever function to be used on the chunked strings.
333
+ chunker: A BaseChunker instance from the chonkie library.
334
+
335
+ Returns:
336
+ A retriever function that chunks string cases and retrieves from the chunks.
337
+ """
338
+
339
+ retriever_func: RetrieverFunc[str, str, S]
340
+ chunker: BaseChunker
341
+
342
+ @override
343
+ def __call__(
344
+ self, batches: Sequence[tuple[Casebase[str, str], str]]
345
+ ) -> Sequence[SimMap[str, S]]:
346
+ chunked_batches: list[tuple[Casebase[str, str], str]] = []
347
+
348
+ for casebase, query in batches:
349
+ chunked_casebase: dict[str, str] = {}
350
+
351
+ for case_key, case_text in casebase.items():
352
+ chunks = self.chunker.chunk(case_text)
353
+
354
+ for i, chunk in enumerate(chunks):
355
+ chunk_key = f"{case_key}-chunk{i}"
356
+ chunked_casebase[chunk_key] = (
357
+ chunk if isinstance(chunk, str) else chunk.text
358
+ )
359
+
360
+ chunked_batches.append((chunked_casebase, query))
361
+
362
+ return self.retriever_func(chunked_batches)
@@ -9,8 +9,9 @@ there is also a measure for attribute-value data.
9
9
  Additionally, the module contains an aggregator to combine multiple local measures into a global score.
10
10
  """
11
11
 
12
- from . import collections, embed, generic, graphs, numbers, strings, taxonomy
13
- from .aggregator import PoolingName, aggregator
12
+ from . import collections, embed, generic, graphs, numbers, pooling, strings, taxonomy
13
+ from .aggregator import aggregator
14
+ from .pooling import PoolingName
14
15
  from .attribute_value import AttributeValueSim, attribute_value
15
16
  from .wrappers import (
16
17
  attribute_table,
@@ -40,6 +41,7 @@ __all__ = [
40
41
  "graphs",
41
42
  "embed",
42
43
  "taxonomy",
44
+ "pooling",
43
45
  "aggregator",
44
46
  "PoolingName",
45
47
  "AttributeValueSim",